{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999311910823643, "eval_steps": 500, "global_step": 3633, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002752356705429024, "grad_norm": 1.190276107072263, "learning_rate": 1.0810810810810812e-06, "loss": 1.3244, "step": 1 }, { "epoch": 0.0005504713410858048, "grad_norm": 1.2084409597297352, "learning_rate": 2.1621621621621623e-06, "loss": 1.2942, "step": 2 }, { "epoch": 0.0008257070116287071, "grad_norm": 1.122752677411088, "learning_rate": 3.2432432432432437e-06, "loss": 1.3128, "step": 3 }, { "epoch": 0.0011009426821716095, "grad_norm": 1.0924048720350894, "learning_rate": 4.324324324324325e-06, "loss": 1.3307, "step": 4 }, { "epoch": 0.0013761783527145117, "grad_norm": 0.9078676095521644, "learning_rate": 5.405405405405406e-06, "loss": 1.2978, "step": 5 }, { "epoch": 0.0016514140232574141, "grad_norm": 1.4160148465734577, "learning_rate": 6.486486486486487e-06, "loss": 1.2813, "step": 6 }, { "epoch": 0.0019266496938003166, "grad_norm": 1.20323318581664, "learning_rate": 7.567567567567569e-06, "loss": 1.258, "step": 7 }, { "epoch": 0.002201885364343219, "grad_norm": 1.7257200500394663, "learning_rate": 8.64864864864865e-06, "loss": 1.2677, "step": 8 }, { "epoch": 0.0024771210348861214, "grad_norm": 1.2460047678725148, "learning_rate": 9.729729729729732e-06, "loss": 1.2826, "step": 9 }, { "epoch": 0.0027523567054290234, "grad_norm": 2.927602556283335, "learning_rate": 1.0810810810810812e-05, "loss": 1.3088, "step": 10 }, { "epoch": 0.003027592375971926, "grad_norm": 2.3291183764604737, "learning_rate": 1.1891891891891894e-05, "loss": 1.336, "step": 11 }, { "epoch": 0.0033028280465148283, "grad_norm": 1.5611186960566894, "learning_rate": 1.2972972972972975e-05, "loss": 1.251, "step": 12 }, { "epoch": 0.0035780637170577307, "grad_norm": 1.507762186709395, "learning_rate": 1.4054054054054055e-05, "loss": 1.2392, "step": 13 }, { "epoch": 0.003853299387600633, "grad_norm": 1.3916628238578803, "learning_rate": 1.5135135135135138e-05, "loss": 1.2263, "step": 14 }, { "epoch": 0.004128535058143535, "grad_norm": 1.4410267212915282, "learning_rate": 1.6216216216216218e-05, "loss": 1.2525, "step": 15 }, { "epoch": 0.004403770728686438, "grad_norm": 1.3503388717753242, "learning_rate": 1.72972972972973e-05, "loss": 1.272, "step": 16 }, { "epoch": 0.00467900639922934, "grad_norm": 1.2435668788235288, "learning_rate": 1.8378378378378383e-05, "loss": 1.2334, "step": 17 }, { "epoch": 0.004954242069772243, "grad_norm": 1.332166272341274, "learning_rate": 1.9459459459459463e-05, "loss": 1.2487, "step": 18 }, { "epoch": 0.005229477740315145, "grad_norm": 1.3657862921427664, "learning_rate": 2.054054054054054e-05, "loss": 1.2216, "step": 19 }, { "epoch": 0.005504713410858047, "grad_norm": 1.3329043032392613, "learning_rate": 2.1621621621621624e-05, "loss": 1.2092, "step": 20 }, { "epoch": 0.00577994908140095, "grad_norm": 1.5624375594145068, "learning_rate": 2.2702702702702705e-05, "loss": 1.2051, "step": 21 }, { "epoch": 0.006055184751943852, "grad_norm": 1.4664415560783262, "learning_rate": 2.378378378378379e-05, "loss": 1.2252, "step": 22 }, { "epoch": 0.006330420422486755, "grad_norm": 1.5659900022345836, "learning_rate": 2.4864864864864866e-05, "loss": 1.1595, "step": 23 }, { "epoch": 0.006605656093029657, "grad_norm": 1.4447862720284197, "learning_rate": 2.594594594594595e-05, "loss": 1.1715, "step": 24 }, { "epoch": 0.006880891763572559, "grad_norm": 1.2617871597038033, "learning_rate": 2.702702702702703e-05, "loss": 1.1759, "step": 25 }, { "epoch": 0.0071561274341154614, "grad_norm": 1.7509911834922023, "learning_rate": 2.810810810810811e-05, "loss": 1.2134, "step": 26 }, { "epoch": 0.007431363104658363, "grad_norm": 1.1184189389114705, "learning_rate": 2.918918918918919e-05, "loss": 1.1652, "step": 27 }, { "epoch": 0.007706598775201266, "grad_norm": 1.6549112009023732, "learning_rate": 3.0270270270270275e-05, "loss": 1.1677, "step": 28 }, { "epoch": 0.00798183444574417, "grad_norm": 1.51189283106951, "learning_rate": 3.135135135135135e-05, "loss": 1.222, "step": 29 }, { "epoch": 0.00825707011628707, "grad_norm": 1.6570028356426652, "learning_rate": 3.2432432432432436e-05, "loss": 1.1138, "step": 30 }, { "epoch": 0.008532305786829973, "grad_norm": 1.4291449194042198, "learning_rate": 3.351351351351351e-05, "loss": 1.2045, "step": 31 }, { "epoch": 0.008807541457372876, "grad_norm": 1.8050828458250154, "learning_rate": 3.45945945945946e-05, "loss": 1.17, "step": 32 }, { "epoch": 0.009082777127915777, "grad_norm": 1.7028887034299325, "learning_rate": 3.567567567567568e-05, "loss": 1.117, "step": 33 }, { "epoch": 0.00935801279845868, "grad_norm": 1.7579705799438476, "learning_rate": 3.6756756756756765e-05, "loss": 1.1313, "step": 34 }, { "epoch": 0.009633248469001583, "grad_norm": 1.841565198358737, "learning_rate": 3.783783783783784e-05, "loss": 1.1352, "step": 35 }, { "epoch": 0.009908484139544486, "grad_norm": 1.51806968835451, "learning_rate": 3.8918918918918926e-05, "loss": 1.1459, "step": 36 }, { "epoch": 0.010183719810087387, "grad_norm": 2.0701574664333102, "learning_rate": 4e-05, "loss": 1.0888, "step": 37 }, { "epoch": 0.01045895548063029, "grad_norm": 1.6155327822146857, "learning_rate": 3.9999992367613554e-05, "loss": 1.079, "step": 38 }, { "epoch": 0.010734191151173193, "grad_norm": 1.672991458945634, "learning_rate": 3.999996947046004e-05, "loss": 1.0786, "step": 39 }, { "epoch": 0.011009426821716094, "grad_norm": 1.6923603269114176, "learning_rate": 3.999993130855694e-05, "loss": 1.1292, "step": 40 }, { "epoch": 0.011284662492258997, "grad_norm": 1.6163864527003113, "learning_rate": 3.999987788193337e-05, "loss": 1.0859, "step": 41 }, { "epoch": 0.0115598981628019, "grad_norm": 1.5568500772991893, "learning_rate": 3.9999809190630105e-05, "loss": 1.0795, "step": 42 }, { "epoch": 0.011835133833344802, "grad_norm": 1.5109463614457816, "learning_rate": 3.999972523469959e-05, "loss": 1.122, "step": 43 }, { "epoch": 0.012110369503887703, "grad_norm": 1.5782539419050023, "learning_rate": 3.9999626014205895e-05, "loss": 1.1123, "step": 44 }, { "epoch": 0.012385605174430606, "grad_norm": 1.5604642948556238, "learning_rate": 3.999951152922474e-05, "loss": 1.1017, "step": 45 }, { "epoch": 0.01266084084497351, "grad_norm": 1.3465924798161213, "learning_rate": 3.9999381779843526e-05, "loss": 1.1362, "step": 46 }, { "epoch": 0.01293607651551641, "grad_norm": 2.2197061958421416, "learning_rate": 3.999923676616125e-05, "loss": 1.149, "step": 47 }, { "epoch": 0.013211312186059313, "grad_norm": 1.275559559204292, "learning_rate": 3.9999076488288625e-05, "loss": 1.0965, "step": 48 }, { "epoch": 0.013486547856602216, "grad_norm": 2.135373580458631, "learning_rate": 3.999890094634796e-05, "loss": 1.1152, "step": 49 }, { "epoch": 0.013761783527145117, "grad_norm": 1.162458419635211, "learning_rate": 3.999871014047324e-05, "loss": 1.0577, "step": 50 }, { "epoch": 0.01403701919768802, "grad_norm": 1.668860695031362, "learning_rate": 3.99985040708101e-05, "loss": 1.0751, "step": 51 }, { "epoch": 0.014312254868230923, "grad_norm": 1.2352790911765088, "learning_rate": 3.9998282737515826e-05, "loss": 1.0523, "step": 52 }, { "epoch": 0.014587490538773826, "grad_norm": 1.6709188838849682, "learning_rate": 3.999804614075934e-05, "loss": 1.0922, "step": 53 }, { "epoch": 0.014862726209316727, "grad_norm": 1.2186730260814729, "learning_rate": 3.9997794280721215e-05, "loss": 1.0946, "step": 54 }, { "epoch": 0.01513796187985963, "grad_norm": 1.5946952239430119, "learning_rate": 3.999752715759368e-05, "loss": 1.0982, "step": 55 }, { "epoch": 0.015413197550402533, "grad_norm": 1.3338876549514764, "learning_rate": 3.999724477158064e-05, "loss": 1.0606, "step": 56 }, { "epoch": 0.015688433220945434, "grad_norm": 1.4391389628569506, "learning_rate": 3.9996947122897594e-05, "loss": 1.0699, "step": 57 }, { "epoch": 0.01596366889148834, "grad_norm": 1.4049265878583999, "learning_rate": 3.999663421177173e-05, "loss": 1.1227, "step": 58 }, { "epoch": 0.01623890456203124, "grad_norm": 1.3947889784751317, "learning_rate": 3.999630603844187e-05, "loss": 1.0886, "step": 59 }, { "epoch": 0.01651414023257414, "grad_norm": 1.4043255062790543, "learning_rate": 3.99959626031585e-05, "loss": 1.0719, "step": 60 }, { "epoch": 0.016789375903117045, "grad_norm": 1.3519206423249013, "learning_rate": 3.9995603906183726e-05, "loss": 1.0834, "step": 61 }, { "epoch": 0.017064611573659946, "grad_norm": 1.422472203995192, "learning_rate": 3.999522994779133e-05, "loss": 1.0501, "step": 62 }, { "epoch": 0.017339847244202847, "grad_norm": 1.4681842143007482, "learning_rate": 3.9994840728266725e-05, "loss": 1.0426, "step": 63 }, { "epoch": 0.017615082914745752, "grad_norm": 1.5623433348992308, "learning_rate": 3.999443624790699e-05, "loss": 1.0442, "step": 64 }, { "epoch": 0.017890318585288653, "grad_norm": 1.2844018013991325, "learning_rate": 3.999401650702083e-05, "loss": 1.0532, "step": 65 }, { "epoch": 0.018165554255831554, "grad_norm": 1.4124923773116047, "learning_rate": 3.999358150592861e-05, "loss": 1.047, "step": 66 }, { "epoch": 0.01844078992637446, "grad_norm": 1.47024691076268, "learning_rate": 3.999313124496234e-05, "loss": 1.0394, "step": 67 }, { "epoch": 0.01871602559691736, "grad_norm": 1.0479405707837466, "learning_rate": 3.9992665724465686e-05, "loss": 1.0159, "step": 68 }, { "epoch": 0.01899126126746026, "grad_norm": 1.6155895896664414, "learning_rate": 3.999218494479393e-05, "loss": 1.0412, "step": 69 }, { "epoch": 0.019266496938003166, "grad_norm": 1.1554035474855078, "learning_rate": 3.999168890631404e-05, "loss": 1.0444, "step": 70 }, { "epoch": 0.019541732608546067, "grad_norm": 1.3916486614853192, "learning_rate": 3.99911776094046e-05, "loss": 1.0716, "step": 71 }, { "epoch": 0.01981696827908897, "grad_norm": 1.0307232993736772, "learning_rate": 3.999065105445586e-05, "loss": 1.0518, "step": 72 }, { "epoch": 0.020092203949631873, "grad_norm": 1.519072854318784, "learning_rate": 3.99901092418697e-05, "loss": 1.0511, "step": 73 }, { "epoch": 0.020367439620174774, "grad_norm": 1.2452532414480466, "learning_rate": 3.998955217205966e-05, "loss": 1.0521, "step": 74 }, { "epoch": 0.02064267529071768, "grad_norm": 1.0823724764686737, "learning_rate": 3.998897984545091e-05, "loss": 1.0283, "step": 75 }, { "epoch": 0.02091791096126058, "grad_norm": 1.4547887075106387, "learning_rate": 3.9988392262480274e-05, "loss": 1.0381, "step": 76 }, { "epoch": 0.02119314663180348, "grad_norm": 0.9223667414315597, "learning_rate": 3.9987789423596224e-05, "loss": 1.0271, "step": 77 }, { "epoch": 0.021468382302346385, "grad_norm": 1.4947900030467116, "learning_rate": 3.998717132925886e-05, "loss": 1.0095, "step": 78 }, { "epoch": 0.021743617972889286, "grad_norm": 1.0246470847225004, "learning_rate": 3.998653797993995e-05, "loss": 1.0274, "step": 79 }, { "epoch": 0.022018853643432187, "grad_norm": 1.586742963776002, "learning_rate": 3.998588937612287e-05, "loss": 1.0084, "step": 80 }, { "epoch": 0.022294089313975092, "grad_norm": 1.1499654167390594, "learning_rate": 3.998522551830267e-05, "loss": 1.0515, "step": 81 }, { "epoch": 0.022569324984517993, "grad_norm": 1.543529968919974, "learning_rate": 3.9984546406986045e-05, "loss": 0.9933, "step": 82 }, { "epoch": 0.022844560655060894, "grad_norm": 0.9475881660146706, "learning_rate": 3.99838520426913e-05, "loss": 1.0159, "step": 83 }, { "epoch": 0.0231197963256038, "grad_norm": 1.351513155062428, "learning_rate": 3.998314242594841e-05, "loss": 0.9955, "step": 84 }, { "epoch": 0.0233950319961467, "grad_norm": 1.1492618928180698, "learning_rate": 3.998241755729897e-05, "loss": 1.0673, "step": 85 }, { "epoch": 0.023670267666689605, "grad_norm": 1.3497208081724184, "learning_rate": 3.9981677437296244e-05, "loss": 1.0341, "step": 86 }, { "epoch": 0.023945503337232506, "grad_norm": 1.1251159534366102, "learning_rate": 3.998092206650511e-05, "loss": 0.994, "step": 87 }, { "epoch": 0.024220739007775407, "grad_norm": 1.2840664657236376, "learning_rate": 3.99801514455021e-05, "loss": 1.0059, "step": 88 }, { "epoch": 0.02449597467831831, "grad_norm": 1.0389167650958175, "learning_rate": 3.997936557487539e-05, "loss": 1.0232, "step": 89 }, { "epoch": 0.024771210348861213, "grad_norm": 1.1934490371559106, "learning_rate": 3.9978564455224764e-05, "loss": 1.0154, "step": 90 }, { "epoch": 0.025046446019404114, "grad_norm": 1.3565126325882042, "learning_rate": 3.9977748087161696e-05, "loss": 1.012, "step": 91 }, { "epoch": 0.02532168168994702, "grad_norm": 1.261096742039059, "learning_rate": 3.997691647130924e-05, "loss": 1.0143, "step": 92 }, { "epoch": 0.02559691736048992, "grad_norm": 1.5361429916101677, "learning_rate": 3.997606960830214e-05, "loss": 0.9854, "step": 93 }, { "epoch": 0.02587215303103282, "grad_norm": 0.7587337794998066, "learning_rate": 3.997520749878675e-05, "loss": 0.9797, "step": 94 }, { "epoch": 0.026147388701575725, "grad_norm": 1.340673980750616, "learning_rate": 3.997433014342106e-05, "loss": 1.0083, "step": 95 }, { "epoch": 0.026422624372118626, "grad_norm": 1.442377076418099, "learning_rate": 3.99734375428747e-05, "loss": 1.0098, "step": 96 }, { "epoch": 0.026697860042661527, "grad_norm": 1.053217627466467, "learning_rate": 3.997252969782895e-05, "loss": 1.0194, "step": 97 }, { "epoch": 0.026973095713204432, "grad_norm": 1.2542749662949786, "learning_rate": 3.9971606608976694e-05, "loss": 1.0421, "step": 98 }, { "epoch": 0.027248331383747333, "grad_norm": 1.0310036113535672, "learning_rate": 3.997066827702248e-05, "loss": 1.0255, "step": 99 }, { "epoch": 0.027523567054290234, "grad_norm": 1.3425906351732504, "learning_rate": 3.996971470268248e-05, "loss": 1.0096, "step": 100 }, { "epoch": 0.02779880272483314, "grad_norm": 1.0033245136899922, "learning_rate": 3.9968745886684496e-05, "loss": 1.0278, "step": 101 }, { "epoch": 0.02807403839537604, "grad_norm": 1.2359786623522877, "learning_rate": 3.996776182976796e-05, "loss": 1.0016, "step": 102 }, { "epoch": 0.028349274065918945, "grad_norm": 0.8996790282725927, "learning_rate": 3.996676253268396e-05, "loss": 0.9778, "step": 103 }, { "epoch": 0.028624509736461846, "grad_norm": 1.122427683480799, "learning_rate": 3.996574799619518e-05, "loss": 1.0021, "step": 104 }, { "epoch": 0.028899745407004747, "grad_norm": 1.0186269169389894, "learning_rate": 3.996471822107596e-05, "loss": 0.9949, "step": 105 }, { "epoch": 0.02917498107754765, "grad_norm": 1.3313548328442482, "learning_rate": 3.996367320811227e-05, "loss": 0.9761, "step": 106 }, { "epoch": 0.029450216748090553, "grad_norm": 0.8643837745261176, "learning_rate": 3.9962612958101696e-05, "loss": 1.0035, "step": 107 }, { "epoch": 0.029725452418633454, "grad_norm": 0.98116052105504, "learning_rate": 3.996153747185347e-05, "loss": 0.9801, "step": 108 }, { "epoch": 0.03000068808917636, "grad_norm": 1.10691016989535, "learning_rate": 3.996044675018842e-05, "loss": 1.0151, "step": 109 }, { "epoch": 0.03027592375971926, "grad_norm": 1.2681369626856358, "learning_rate": 3.9959340793939064e-05, "loss": 1.0307, "step": 110 }, { "epoch": 0.03055115943026216, "grad_norm": 1.0276436414413004, "learning_rate": 3.9958219603949486e-05, "loss": 0.9759, "step": 111 }, { "epoch": 0.030826395100805065, "grad_norm": 1.420865033430615, "learning_rate": 3.995708318107543e-05, "loss": 0.9859, "step": 112 }, { "epoch": 0.031101630771347966, "grad_norm": 0.8500712282512342, "learning_rate": 3.995593152618425e-05, "loss": 1.0195, "step": 113 }, { "epoch": 0.03137686644189087, "grad_norm": 1.102496992794187, "learning_rate": 3.995476464015495e-05, "loss": 0.9947, "step": 114 }, { "epoch": 0.03165210211243377, "grad_norm": 1.3088189045845282, "learning_rate": 3.995358252387813e-05, "loss": 0.9779, "step": 115 }, { "epoch": 0.03192733778297668, "grad_norm": 0.9473377671663621, "learning_rate": 3.995238517825602e-05, "loss": 1.0109, "step": 116 }, { "epoch": 0.032202573453519574, "grad_norm": 0.9374545115515334, "learning_rate": 3.9951172604202494e-05, "loss": 0.9705, "step": 117 }, { "epoch": 0.03247780912406248, "grad_norm": 1.122739476374123, "learning_rate": 3.9949944802643036e-05, "loss": 0.9877, "step": 118 }, { "epoch": 0.032753044794605383, "grad_norm": 1.2181012183983715, "learning_rate": 3.994870177451474e-05, "loss": 0.9867, "step": 119 }, { "epoch": 0.03302828046514828, "grad_norm": 1.0190839936162515, "learning_rate": 3.994744352076634e-05, "loss": 0.9966, "step": 120 }, { "epoch": 0.033303516135691186, "grad_norm": 1.091612675267103, "learning_rate": 3.9946170042358185e-05, "loss": 0.987, "step": 121 }, { "epoch": 0.03357875180623409, "grad_norm": 0.9232945314206387, "learning_rate": 3.994488134026224e-05, "loss": 0.9963, "step": 122 }, { "epoch": 0.03385398747677699, "grad_norm": 1.0736749598534863, "learning_rate": 3.99435774154621e-05, "loss": 1.0157, "step": 123 }, { "epoch": 0.03412922314731989, "grad_norm": 1.2475900441860863, "learning_rate": 3.994225826895295e-05, "loss": 0.9846, "step": 124 }, { "epoch": 0.0344044588178628, "grad_norm": 0.8913702469448699, "learning_rate": 3.994092390174164e-05, "loss": 0.9947, "step": 125 }, { "epoch": 0.034679694488405695, "grad_norm": 1.0342284706230065, "learning_rate": 3.993957431484659e-05, "loss": 0.9563, "step": 126 }, { "epoch": 0.0349549301589486, "grad_norm": 1.1094942836810213, "learning_rate": 3.993820950929787e-05, "loss": 0.9564, "step": 127 }, { "epoch": 0.035230165829491504, "grad_norm": 1.1644167313950693, "learning_rate": 3.9936829486137145e-05, "loss": 1.005, "step": 128 }, { "epoch": 0.0355054015000344, "grad_norm": 1.0541434126030913, "learning_rate": 3.993543424641771e-05, "loss": 0.9629, "step": 129 }, { "epoch": 0.035780637170577306, "grad_norm": 1.2719003588455873, "learning_rate": 3.993402379120446e-05, "loss": 0.9779, "step": 130 }, { "epoch": 0.03605587284112021, "grad_norm": 0.9814772038816815, "learning_rate": 3.9932598121573906e-05, "loss": 0.9683, "step": 131 }, { "epoch": 0.03633110851166311, "grad_norm": 1.113127253040516, "learning_rate": 3.993115723861418e-05, "loss": 0.9484, "step": 132 }, { "epoch": 0.03660634418220601, "grad_norm": 0.9401838923203238, "learning_rate": 3.9929701143425014e-05, "loss": 0.9754, "step": 133 }, { "epoch": 0.03688157985274892, "grad_norm": 1.0452901290660856, "learning_rate": 3.992822983711776e-05, "loss": 0.9752, "step": 134 }, { "epoch": 0.037156815523291815, "grad_norm": 1.029671900565212, "learning_rate": 3.992674332081538e-05, "loss": 0.9897, "step": 135 }, { "epoch": 0.03743205119383472, "grad_norm": 1.0498191361206552, "learning_rate": 3.992524159565243e-05, "loss": 0.9637, "step": 136 }, { "epoch": 0.037707286864377625, "grad_norm": 1.0044264183964862, "learning_rate": 3.992372466277509e-05, "loss": 1.0147, "step": 137 }, { "epoch": 0.03798252253492052, "grad_norm": 1.0506663533652345, "learning_rate": 3.992219252334114e-05, "loss": 0.9392, "step": 138 }, { "epoch": 0.03825775820546343, "grad_norm": 1.21490462827667, "learning_rate": 3.992064517851998e-05, "loss": 1.0044, "step": 139 }, { "epoch": 0.03853299387600633, "grad_norm": 0.9708595219699383, "learning_rate": 3.9919082629492585e-05, "loss": 0.9724, "step": 140 }, { "epoch": 0.038808229546549236, "grad_norm": 1.0754728525997272, "learning_rate": 3.9917504877451563e-05, "loss": 0.9732, "step": 141 }, { "epoch": 0.039083465217092134, "grad_norm": 1.0770903291389418, "learning_rate": 3.991591192360112e-05, "loss": 0.9783, "step": 142 }, { "epoch": 0.03935870088763504, "grad_norm": 1.022206500658949, "learning_rate": 3.991430376915704e-05, "loss": 1.003, "step": 143 }, { "epoch": 0.03963393655817794, "grad_norm": 1.2010136364603878, "learning_rate": 3.991268041534676e-05, "loss": 0.9622, "step": 144 }, { "epoch": 0.03990917222872084, "grad_norm": 0.9904092925521828, "learning_rate": 3.991104186340926e-05, "loss": 0.9903, "step": 145 }, { "epoch": 0.040184407899263745, "grad_norm": 1.2395035830387846, "learning_rate": 3.990938811459516e-05, "loss": 0.974, "step": 146 }, { "epoch": 0.04045964356980665, "grad_norm": 0.7353479908408915, "learning_rate": 3.990771917016665e-05, "loss": 1.0046, "step": 147 }, { "epoch": 0.04073487924034955, "grad_norm": 0.9920011334434718, "learning_rate": 3.990603503139755e-05, "loss": 0.9755, "step": 148 }, { "epoch": 0.04101011491089245, "grad_norm": 1.107219145078194, "learning_rate": 3.9904335699573245e-05, "loss": 1.0003, "step": 149 }, { "epoch": 0.04128535058143536, "grad_norm": 0.8548007475694676, "learning_rate": 3.990262117599074e-05, "loss": 0.962, "step": 150 }, { "epoch": 0.041560586251978254, "grad_norm": 1.1104937026132244, "learning_rate": 3.990089146195863e-05, "loss": 0.9254, "step": 151 }, { "epoch": 0.04183582192252116, "grad_norm": 0.9909627357699444, "learning_rate": 3.98991465587971e-05, "loss": 0.9785, "step": 152 }, { "epoch": 0.042111057593064063, "grad_norm": 0.939332432355087, "learning_rate": 3.98973864678379e-05, "loss": 0.9868, "step": 153 }, { "epoch": 0.04238629326360696, "grad_norm": 0.9767033192925513, "learning_rate": 3.989561119042444e-05, "loss": 0.9537, "step": 154 }, { "epoch": 0.042661528934149866, "grad_norm": 1.0137423941705093, "learning_rate": 3.989382072791166e-05, "loss": 0.9414, "step": 155 }, { "epoch": 0.04293676460469277, "grad_norm": 0.9284874715319218, "learning_rate": 3.98920150816661e-05, "loss": 0.9842, "step": 156 }, { "epoch": 0.04321200027523567, "grad_norm": 0.9187385752846338, "learning_rate": 3.989019425306591e-05, "loss": 0.9935, "step": 157 }, { "epoch": 0.04348723594577857, "grad_norm": 0.9949928953759635, "learning_rate": 3.9888358243500825e-05, "loss": 0.9468, "step": 158 }, { "epoch": 0.04376247161632148, "grad_norm": 1.1920415793958623, "learning_rate": 3.988650705437214e-05, "loss": 0.93, "step": 159 }, { "epoch": 0.044037707286864375, "grad_norm": 0.9196365103589655, "learning_rate": 3.9884640687092775e-05, "loss": 0.9581, "step": 160 }, { "epoch": 0.04431294295740728, "grad_norm": 0.8949804148675419, "learning_rate": 3.9882759143087194e-05, "loss": 0.9922, "step": 161 }, { "epoch": 0.044588178627950184, "grad_norm": 0.7927064598466735, "learning_rate": 3.988086242379148e-05, "loss": 0.97, "step": 162 }, { "epoch": 0.04486341429849308, "grad_norm": 0.8246249743192302, "learning_rate": 3.987895053065327e-05, "loss": 0.9687, "step": 163 }, { "epoch": 0.045138649969035986, "grad_norm": 0.8539801260308312, "learning_rate": 3.9877023465131806e-05, "loss": 0.9226, "step": 164 }, { "epoch": 0.04541388563957889, "grad_norm": 0.8531812582063119, "learning_rate": 3.987508122869789e-05, "loss": 0.9457, "step": 165 }, { "epoch": 0.04568912131012179, "grad_norm": 0.846264771307889, "learning_rate": 3.987312382283391e-05, "loss": 0.9255, "step": 166 }, { "epoch": 0.04596435698066469, "grad_norm": 0.8755538095182204, "learning_rate": 3.9871151249033844e-05, "loss": 0.9525, "step": 167 }, { "epoch": 0.0462395926512076, "grad_norm": 0.8199336462623763, "learning_rate": 3.986916350880323e-05, "loss": 0.9228, "step": 168 }, { "epoch": 0.046514828321750495, "grad_norm": 0.775083736366338, "learning_rate": 3.986716060365919e-05, "loss": 0.9579, "step": 169 }, { "epoch": 0.0467900639922934, "grad_norm": 0.8015872286193737, "learning_rate": 3.986514253513042e-05, "loss": 0.9415, "step": 170 }, { "epoch": 0.047065299662836305, "grad_norm": 0.8947424426024164, "learning_rate": 3.986310930475719e-05, "loss": 0.9374, "step": 171 }, { "epoch": 0.04734053533337921, "grad_norm": 1.0177453148917428, "learning_rate": 3.986106091409133e-05, "loss": 0.9613, "step": 172 }, { "epoch": 0.04761577100392211, "grad_norm": 0.9995155260477647, "learning_rate": 3.9858997364696254e-05, "loss": 0.9489, "step": 173 }, { "epoch": 0.04789100667446501, "grad_norm": 1.0058466996884183, "learning_rate": 3.985691865814695e-05, "loss": 0.9396, "step": 174 }, { "epoch": 0.048166242345007916, "grad_norm": 0.9309539337174383, "learning_rate": 3.985482479602996e-05, "loss": 0.9404, "step": 175 }, { "epoch": 0.048441478015550814, "grad_norm": 0.8007602895755056, "learning_rate": 3.9852715779943404e-05, "loss": 0.9477, "step": 176 }, { "epoch": 0.04871671368609372, "grad_norm": 0.5607973976392494, "learning_rate": 3.985059161149696e-05, "loss": 0.9446, "step": 177 }, { "epoch": 0.04899194935663662, "grad_norm": 0.6269194351562074, "learning_rate": 3.984845229231189e-05, "loss": 0.9043, "step": 178 }, { "epoch": 0.04926718502717952, "grad_norm": 0.8498363383103973, "learning_rate": 3.984629782402098e-05, "loss": 0.9572, "step": 179 }, { "epoch": 0.049542420697722425, "grad_norm": 0.971173881009018, "learning_rate": 3.9844128208268634e-05, "loss": 0.9583, "step": 180 }, { "epoch": 0.04981765636826533, "grad_norm": 1.0056651754503767, "learning_rate": 3.9841943446710756e-05, "loss": 0.928, "step": 181 }, { "epoch": 0.05009289203880823, "grad_norm": 0.8850820710100574, "learning_rate": 3.983974354101486e-05, "loss": 0.9501, "step": 182 }, { "epoch": 0.05036812770935113, "grad_norm": 0.6584048206114839, "learning_rate": 3.983752849286e-05, "loss": 1.0529, "step": 183 }, { "epoch": 0.05064336337989404, "grad_norm": 0.5918767008080843, "learning_rate": 3.983529830393677e-05, "loss": 0.9018, "step": 184 }, { "epoch": 0.050918599050436934, "grad_norm": 0.7851287049394087, "learning_rate": 3.9833052975947356e-05, "loss": 0.9542, "step": 185 }, { "epoch": 0.05119383472097984, "grad_norm": 0.8516369448448238, "learning_rate": 3.9830792510605463e-05, "loss": 0.9326, "step": 186 }, { "epoch": 0.051469070391522743, "grad_norm": 0.7828498616864874, "learning_rate": 3.982851690963637e-05, "loss": 0.9725, "step": 187 }, { "epoch": 0.05174430606206564, "grad_norm": 0.7526449796432927, "learning_rate": 3.982622617477691e-05, "loss": 0.9741, "step": 188 }, { "epoch": 0.052019541732608546, "grad_norm": 0.7151234103635482, "learning_rate": 3.9823920307775464e-05, "loss": 0.9191, "step": 189 }, { "epoch": 0.05229477740315145, "grad_norm": 0.6920502159129005, "learning_rate": 3.982159931039194e-05, "loss": 0.9385, "step": 190 }, { "epoch": 0.05257001307369435, "grad_norm": 0.7381612546258817, "learning_rate": 3.981926318439782e-05, "loss": 0.9482, "step": 191 }, { "epoch": 0.05284524874423725, "grad_norm": 0.7023056208374715, "learning_rate": 3.981691193157614e-05, "loss": 0.9376, "step": 192 }, { "epoch": 0.05312048441478016, "grad_norm": 0.622343310183164, "learning_rate": 3.9814545553721456e-05, "loss": 0.9337, "step": 193 }, { "epoch": 0.053395720085323055, "grad_norm": 0.6795583093281086, "learning_rate": 3.981216405263987e-05, "loss": 0.9465, "step": 194 }, { "epoch": 0.05367095575586596, "grad_norm": 0.7025074228756656, "learning_rate": 3.980976743014905e-05, "loss": 0.9629, "step": 195 }, { "epoch": 0.053946191426408864, "grad_norm": 0.7154857620479277, "learning_rate": 3.9807355688078193e-05, "loss": 0.9609, "step": 196 }, { "epoch": 0.05422142709695176, "grad_norm": 0.6481641291088678, "learning_rate": 3.9804928828268015e-05, "loss": 0.9278, "step": 197 }, { "epoch": 0.054496662767494666, "grad_norm": 0.7075225654319801, "learning_rate": 3.980248685257081e-05, "loss": 0.9465, "step": 198 }, { "epoch": 0.05477189843803757, "grad_norm": 0.8236382811112433, "learning_rate": 3.980002976285037e-05, "loss": 0.9202, "step": 199 }, { "epoch": 0.05504713410858047, "grad_norm": 0.8492664853773008, "learning_rate": 3.9797557560982056e-05, "loss": 0.9491, "step": 200 }, { "epoch": 0.05532236977912337, "grad_norm": 0.8531323294396649, "learning_rate": 3.979507024885274e-05, "loss": 0.9361, "step": 201 }, { "epoch": 0.05559760544966628, "grad_norm": 1.2050949971243259, "learning_rate": 3.9792567828360843e-05, "loss": 0.939, "step": 202 }, { "epoch": 0.05587284112020918, "grad_norm": 0.6054954581158436, "learning_rate": 3.97900503014163e-05, "loss": 0.9502, "step": 203 }, { "epoch": 0.05614807679075208, "grad_norm": 0.7022132332194272, "learning_rate": 3.978751766994059e-05, "loss": 0.9512, "step": 204 }, { "epoch": 0.056423312461294985, "grad_norm": 0.7524655364093807, "learning_rate": 3.97849699358667e-05, "loss": 0.9378, "step": 205 }, { "epoch": 0.05669854813183789, "grad_norm": 0.8229128606866842, "learning_rate": 3.978240710113919e-05, "loss": 0.9252, "step": 206 }, { "epoch": 0.05697378380238079, "grad_norm": 0.8015210160942963, "learning_rate": 3.977982916771408e-05, "loss": 0.9628, "step": 207 }, { "epoch": 0.05724901947292369, "grad_norm": 0.6550145265716083, "learning_rate": 3.977723613755897e-05, "loss": 0.9351, "step": 208 }, { "epoch": 0.057524255143466596, "grad_norm": 0.783410134030528, "learning_rate": 3.9774628012652965e-05, "loss": 0.9026, "step": 209 }, { "epoch": 0.057799490814009494, "grad_norm": 0.8364287945657276, "learning_rate": 3.9772004794986665e-05, "loss": 0.9052, "step": 210 }, { "epoch": 0.0580747264845524, "grad_norm": 0.8053375394519587, "learning_rate": 3.976936648656223e-05, "loss": 0.8964, "step": 211 }, { "epoch": 0.0583499621550953, "grad_norm": 0.8625980591094503, "learning_rate": 3.976671308939331e-05, "loss": 0.9051, "step": 212 }, { "epoch": 0.0586251978256382, "grad_norm": 0.9634136358053212, "learning_rate": 3.976404460550509e-05, "loss": 0.8621, "step": 213 }, { "epoch": 0.058900433496181105, "grad_norm": 1.0650392212088253, "learning_rate": 3.976136103693424e-05, "loss": 0.9111, "step": 214 }, { "epoch": 0.05917566916672401, "grad_norm": 0.8740403534764651, "learning_rate": 3.9758662385728984e-05, "loss": 0.9366, "step": 215 }, { "epoch": 0.05945090483726691, "grad_norm": 0.6401848780793619, "learning_rate": 3.975594865394903e-05, "loss": 0.9537, "step": 216 }, { "epoch": 0.05972614050780981, "grad_norm": 0.6706070644677035, "learning_rate": 3.97532198436656e-05, "loss": 0.9362, "step": 217 }, { "epoch": 0.06000137617835272, "grad_norm": 0.7707750247530193, "learning_rate": 3.975047595696142e-05, "loss": 0.9437, "step": 218 }, { "epoch": 0.060276611848895614, "grad_norm": 0.7921067517201671, "learning_rate": 3.974771699593076e-05, "loss": 0.9515, "step": 219 }, { "epoch": 0.06055184751943852, "grad_norm": 0.7622632352659637, "learning_rate": 3.974494296267933e-05, "loss": 0.9137, "step": 220 }, { "epoch": 0.060827083189981423, "grad_norm": 0.7753020252514123, "learning_rate": 3.9742153859324403e-05, "loss": 0.9477, "step": 221 }, { "epoch": 0.06110231886052432, "grad_norm": 0.7746283948680501, "learning_rate": 3.9739349687994713e-05, "loss": 0.9404, "step": 222 }, { "epoch": 0.061377554531067226, "grad_norm": 0.6872362412419311, "learning_rate": 3.9736530450830525e-05, "loss": 0.9442, "step": 223 }, { "epoch": 0.06165279020161013, "grad_norm": 0.6563306084572785, "learning_rate": 3.9733696149983586e-05, "loss": 0.9379, "step": 224 }, { "epoch": 0.06192802587215303, "grad_norm": 0.7495256808670511, "learning_rate": 3.9730846787617145e-05, "loss": 0.9649, "step": 225 }, { "epoch": 0.06220326154269593, "grad_norm": 0.7568541570047215, "learning_rate": 3.972798236590595e-05, "loss": 0.8936, "step": 226 }, { "epoch": 0.06247849721323884, "grad_norm": 0.7666357668612135, "learning_rate": 3.972510288703622e-05, "loss": 0.9227, "step": 227 }, { "epoch": 0.06275373288378173, "grad_norm": 0.6930907336537482, "learning_rate": 3.9722208353205704e-05, "loss": 0.9552, "step": 228 }, { "epoch": 0.06302896855432465, "grad_norm": 0.6806540530714671, "learning_rate": 3.9719298766623614e-05, "loss": 0.9431, "step": 229 }, { "epoch": 0.06330420422486754, "grad_norm": 0.6249426098911484, "learning_rate": 3.971637412951066e-05, "loss": 0.9257, "step": 230 }, { "epoch": 0.06357943989541044, "grad_norm": 0.678997884529542, "learning_rate": 3.971343444409904e-05, "loss": 0.9324, "step": 231 }, { "epoch": 0.06385467556595335, "grad_norm": 0.6504053402255093, "learning_rate": 3.9710479712632435e-05, "loss": 0.9298, "step": 232 }, { "epoch": 0.06412991123649625, "grad_norm": 0.6813223741520832, "learning_rate": 3.9707509937366006e-05, "loss": 0.9234, "step": 233 }, { "epoch": 0.06440514690703915, "grad_norm": 0.5645584348910755, "learning_rate": 3.9704525120566406e-05, "loss": 0.899, "step": 234 }, { "epoch": 0.06468038257758206, "grad_norm": 0.6096834703891368, "learning_rate": 3.970152526451176e-05, "loss": 0.922, "step": 235 }, { "epoch": 0.06495561824812496, "grad_norm": 0.7075292059001774, "learning_rate": 3.969851037149167e-05, "loss": 0.9206, "step": 236 }, { "epoch": 0.06523085391866786, "grad_norm": 0.6718415377168108, "learning_rate": 3.969548044380722e-05, "loss": 0.8914, "step": 237 }, { "epoch": 0.06550608958921077, "grad_norm": 0.7192792263292144, "learning_rate": 3.969243548377098e-05, "loss": 0.95, "step": 238 }, { "epoch": 0.06578132525975366, "grad_norm": 0.6723385117598139, "learning_rate": 3.968937549370696e-05, "loss": 0.9259, "step": 239 }, { "epoch": 0.06605656093029656, "grad_norm": 0.653817726766455, "learning_rate": 3.9686300475950686e-05, "loss": 0.9126, "step": 240 }, { "epoch": 0.06633179660083947, "grad_norm": 0.6365110370621555, "learning_rate": 3.968321043284912e-05, "loss": 0.9198, "step": 241 }, { "epoch": 0.06660703227138237, "grad_norm": 0.6252107892810178, "learning_rate": 3.9680105366760686e-05, "loss": 0.9122, "step": 242 }, { "epoch": 0.06688226794192527, "grad_norm": 0.6904254889862481, "learning_rate": 3.9676985280055315e-05, "loss": 0.9172, "step": 243 }, { "epoch": 0.06715750361246818, "grad_norm": 0.6986909512580151, "learning_rate": 3.9673850175114375e-05, "loss": 0.9318, "step": 244 }, { "epoch": 0.06743273928301108, "grad_norm": 0.6788305029535093, "learning_rate": 3.9670700054330685e-05, "loss": 0.9428, "step": 245 }, { "epoch": 0.06770797495355398, "grad_norm": 0.5920166716231594, "learning_rate": 3.9667534920108545e-05, "loss": 0.9142, "step": 246 }, { "epoch": 0.06798321062409689, "grad_norm": 0.5856057224883433, "learning_rate": 3.966435477486371e-05, "loss": 0.9186, "step": 247 }, { "epoch": 0.06825844629463979, "grad_norm": 0.6403810123397401, "learning_rate": 3.966115962102339e-05, "loss": 0.926, "step": 248 }, { "epoch": 0.06853368196518268, "grad_norm": 0.7177599031300991, "learning_rate": 3.965794946102625e-05, "loss": 0.913, "step": 249 }, { "epoch": 0.0688089176357256, "grad_norm": 0.6657273816333562, "learning_rate": 3.9654724297322406e-05, "loss": 0.9264, "step": 250 }, { "epoch": 0.06908415330626849, "grad_norm": 0.7383600410114154, "learning_rate": 3.965148413237342e-05, "loss": 0.9296, "step": 251 }, { "epoch": 0.06935938897681139, "grad_norm": 0.5409635282957039, "learning_rate": 3.964822896865234e-05, "loss": 0.9117, "step": 252 }, { "epoch": 0.0696346246473543, "grad_norm": 0.6016959320603094, "learning_rate": 3.96449588086436e-05, "loss": 0.9111, "step": 253 }, { "epoch": 0.0699098603178972, "grad_norm": 0.6359959140668964, "learning_rate": 3.964167365484312e-05, "loss": 0.8903, "step": 254 }, { "epoch": 0.0701850959884401, "grad_norm": 0.7068907835626644, "learning_rate": 3.9638373509758274e-05, "loss": 0.9083, "step": 255 }, { "epoch": 0.07046033165898301, "grad_norm": 0.7151486793125481, "learning_rate": 3.9635058375907836e-05, "loss": 0.9502, "step": 256 }, { "epoch": 0.0707355673295259, "grad_norm": 0.6389786677951766, "learning_rate": 3.963172825582206e-05, "loss": 0.9124, "step": 257 }, { "epoch": 0.0710108030000688, "grad_norm": 0.5910610648092535, "learning_rate": 3.962838315204262e-05, "loss": 0.9242, "step": 258 }, { "epoch": 0.07128603867061171, "grad_norm": 0.543348196410837, "learning_rate": 3.962502306712263e-05, "loss": 0.9436, "step": 259 }, { "epoch": 0.07156127434115461, "grad_norm": 0.4759432647751585, "learning_rate": 3.962164800362662e-05, "loss": 0.94, "step": 260 }, { "epoch": 0.07183651001169751, "grad_norm": 0.5489501422865493, "learning_rate": 3.961825796413059e-05, "loss": 0.894, "step": 261 }, { "epoch": 0.07211174568224042, "grad_norm": 0.5545140012133268, "learning_rate": 3.9614852951221945e-05, "loss": 0.9268, "step": 262 }, { "epoch": 0.07238698135278332, "grad_norm": 0.5176304561516295, "learning_rate": 3.961143296749952e-05, "loss": 0.9018, "step": 263 }, { "epoch": 0.07266221702332622, "grad_norm": 0.4936550228340853, "learning_rate": 3.960799801557357e-05, "loss": 0.9271, "step": 264 }, { "epoch": 0.07293745269386913, "grad_norm": 0.4649125188981785, "learning_rate": 3.9604548098065796e-05, "loss": 0.9009, "step": 265 }, { "epoch": 0.07321268836441203, "grad_norm": 0.45983355235823387, "learning_rate": 3.96010832176093e-05, "loss": 0.9095, "step": 266 }, { "epoch": 0.07348792403495492, "grad_norm": 0.46023598503687446, "learning_rate": 3.9597603376848614e-05, "loss": 0.9525, "step": 267 }, { "epoch": 0.07376315970549784, "grad_norm": 0.43826928991196545, "learning_rate": 3.959410857843969e-05, "loss": 0.9357, "step": 268 }, { "epoch": 0.07403839537604073, "grad_norm": 0.44969426028379234, "learning_rate": 3.9590598825049896e-05, "loss": 0.9052, "step": 269 }, { "epoch": 0.07431363104658363, "grad_norm": 0.4172117173156426, "learning_rate": 3.9587074119358e-05, "loss": 0.9029, "step": 270 }, { "epoch": 0.07458886671712654, "grad_norm": 0.4484248145931521, "learning_rate": 3.95835344640542e-05, "loss": 0.9308, "step": 271 }, { "epoch": 0.07486410238766944, "grad_norm": 0.5135846985483652, "learning_rate": 3.957997986184011e-05, "loss": 0.9143, "step": 272 }, { "epoch": 0.07513933805821234, "grad_norm": 0.48945107499007995, "learning_rate": 3.957641031542872e-05, "loss": 0.9235, "step": 273 }, { "epoch": 0.07541457372875525, "grad_norm": 0.5754430181789083, "learning_rate": 3.957282582754445e-05, "loss": 0.9134, "step": 274 }, { "epoch": 0.07568980939929815, "grad_norm": 0.5531912332257736, "learning_rate": 3.9569226400923135e-05, "loss": 0.9126, "step": 275 }, { "epoch": 0.07596504506984104, "grad_norm": 0.6388268251950844, "learning_rate": 3.956561203831198e-05, "loss": 0.8906, "step": 276 }, { "epoch": 0.07624028074038396, "grad_norm": 0.7349885262264578, "learning_rate": 3.9561982742469606e-05, "loss": 0.9171, "step": 277 }, { "epoch": 0.07651551641092685, "grad_norm": 0.6680870684670991, "learning_rate": 3.955833851616604e-05, "loss": 0.873, "step": 278 }, { "epoch": 0.07679075208146977, "grad_norm": 0.6461308971358634, "learning_rate": 3.95546793621827e-05, "loss": 0.9046, "step": 279 }, { "epoch": 0.07706598775201266, "grad_norm": 0.6300989584233229, "learning_rate": 3.955100528331238e-05, "loss": 0.8672, "step": 280 }, { "epoch": 0.07734122342255556, "grad_norm": 0.6351347433080852, "learning_rate": 3.9547316282359284e-05, "loss": 0.9448, "step": 281 }, { "epoch": 0.07761645909309847, "grad_norm": 0.953155967281556, "learning_rate": 3.954361236213901e-05, "loss": 0.9118, "step": 282 }, { "epoch": 0.07789169476364137, "grad_norm": 0.7702100160420944, "learning_rate": 3.9539893525478524e-05, "loss": 0.9099, "step": 283 }, { "epoch": 0.07816693043418427, "grad_norm": 0.8391527574702072, "learning_rate": 3.9536159775216185e-05, "loss": 0.9096, "step": 284 }, { "epoch": 0.07844216610472718, "grad_norm": 0.925785241620294, "learning_rate": 3.953241111420174e-05, "loss": 0.9365, "step": 285 }, { "epoch": 0.07871740177527008, "grad_norm": 1.0370904205356115, "learning_rate": 3.9528647545296306e-05, "loss": 0.9076, "step": 286 }, { "epoch": 0.07899263744581297, "grad_norm": 0.8333618823361661, "learning_rate": 3.952486907137239e-05, "loss": 0.9239, "step": 287 }, { "epoch": 0.07926787311635589, "grad_norm": 0.616154186568232, "learning_rate": 3.9521075695313864e-05, "loss": 0.9181, "step": 288 }, { "epoch": 0.07954310878689878, "grad_norm": 0.6315972529810878, "learning_rate": 3.951726742001599e-05, "loss": 0.8923, "step": 289 }, { "epoch": 0.07981834445744168, "grad_norm": 0.751290814003144, "learning_rate": 3.951344424838538e-05, "loss": 0.9555, "step": 290 }, { "epoch": 0.08009358012798459, "grad_norm": 0.6998218974846498, "learning_rate": 3.9509606183340026e-05, "loss": 0.8874, "step": 291 }, { "epoch": 0.08036881579852749, "grad_norm": 0.7352815171351497, "learning_rate": 3.950575322780929e-05, "loss": 0.9089, "step": 292 }, { "epoch": 0.08064405146907039, "grad_norm": 0.7023022013545431, "learning_rate": 3.9501885384733906e-05, "loss": 0.909, "step": 293 }, { "epoch": 0.0809192871396133, "grad_norm": 0.6749091557752724, "learning_rate": 3.949800265706595e-05, "loss": 0.8704, "step": 294 }, { "epoch": 0.0811945228101562, "grad_norm": 0.7312732879936177, "learning_rate": 3.949410504776887e-05, "loss": 0.8886, "step": 295 }, { "epoch": 0.0814697584806991, "grad_norm": 0.655108696795065, "learning_rate": 3.949019255981747e-05, "loss": 0.942, "step": 296 }, { "epoch": 0.081744994151242, "grad_norm": 0.6761260309562537, "learning_rate": 3.948626519619793e-05, "loss": 0.908, "step": 297 }, { "epoch": 0.0820202298217849, "grad_norm": 0.5891569339909765, "learning_rate": 3.9482322959907745e-05, "loss": 0.8947, "step": 298 }, { "epoch": 0.0822954654923278, "grad_norm": 0.5268646031762036, "learning_rate": 3.947836585395579e-05, "loss": 0.8896, "step": 299 }, { "epoch": 0.08257070116287071, "grad_norm": 0.5572584026668544, "learning_rate": 3.947439388136228e-05, "loss": 0.9279, "step": 300 }, { "epoch": 0.08284593683341361, "grad_norm": 0.6016927517733835, "learning_rate": 3.947040704515878e-05, "loss": 0.9121, "step": 301 }, { "epoch": 0.08312117250395651, "grad_norm": 0.6196112117746112, "learning_rate": 3.94664053483882e-05, "loss": 0.9493, "step": 302 }, { "epoch": 0.08339640817449942, "grad_norm": 0.5729651894313627, "learning_rate": 3.946238879410478e-05, "loss": 0.9029, "step": 303 }, { "epoch": 0.08367164384504232, "grad_norm": 0.5416308933869269, "learning_rate": 3.9458357385374116e-05, "loss": 0.9092, "step": 304 }, { "epoch": 0.08394687951558522, "grad_norm": 0.5781810718082886, "learning_rate": 3.945431112527314e-05, "loss": 0.964, "step": 305 }, { "epoch": 0.08422211518612813, "grad_norm": 0.5592511163596525, "learning_rate": 3.94502500168901e-05, "loss": 0.903, "step": 306 }, { "epoch": 0.08449735085667102, "grad_norm": 0.5541896438592137, "learning_rate": 3.944617406332461e-05, "loss": 0.8853, "step": 307 }, { "epoch": 0.08477258652721392, "grad_norm": 0.5489645129909775, "learning_rate": 3.944208326768758e-05, "loss": 0.911, "step": 308 }, { "epoch": 0.08504782219775683, "grad_norm": 0.5625633636794496, "learning_rate": 3.9437977633101266e-05, "loss": 0.8833, "step": 309 }, { "epoch": 0.08532305786829973, "grad_norm": 0.5448235271657504, "learning_rate": 3.9433857162699245e-05, "loss": 0.8645, "step": 310 }, { "epoch": 0.08559829353884263, "grad_norm": 0.6148293436890931, "learning_rate": 3.9429721859626434e-05, "loss": 0.8982, "step": 311 }, { "epoch": 0.08587352920938554, "grad_norm": 0.5856838059042696, "learning_rate": 3.942557172703903e-05, "loss": 0.8764, "step": 312 }, { "epoch": 0.08614876487992844, "grad_norm": 0.6627976946036365, "learning_rate": 3.94214067681046e-05, "loss": 0.8854, "step": 313 }, { "epoch": 0.08642400055047134, "grad_norm": 0.723862149221964, "learning_rate": 3.9417226986001994e-05, "loss": 0.9025, "step": 314 }, { "epoch": 0.08669923622101425, "grad_norm": 0.649076526741882, "learning_rate": 3.9413032383921374e-05, "loss": 0.8537, "step": 315 }, { "epoch": 0.08697447189155715, "grad_norm": 0.5897716399851319, "learning_rate": 3.940882296506423e-05, "loss": 0.9179, "step": 316 }, { "epoch": 0.08724970756210004, "grad_norm": 0.5707783248675382, "learning_rate": 3.940459873264336e-05, "loss": 0.9182, "step": 317 }, { "epoch": 0.08752494323264295, "grad_norm": 0.5739789279770842, "learning_rate": 3.940035968988284e-05, "loss": 0.8827, "step": 318 }, { "epoch": 0.08780017890318585, "grad_norm": 0.587354108382882, "learning_rate": 3.939610584001809e-05, "loss": 0.9102, "step": 319 }, { "epoch": 0.08807541457372875, "grad_norm": 0.5812972779745122, "learning_rate": 3.9391837186295816e-05, "loss": 0.8915, "step": 320 }, { "epoch": 0.08835065024427166, "grad_norm": 0.5602934710906348, "learning_rate": 3.9387553731974e-05, "loss": 0.8849, "step": 321 }, { "epoch": 0.08862588591481456, "grad_norm": 0.5928749356180759, "learning_rate": 3.9383255480321955e-05, "loss": 0.896, "step": 322 }, { "epoch": 0.08890112158535746, "grad_norm": 0.4846448604955536, "learning_rate": 3.937894243462027e-05, "loss": 0.894, "step": 323 }, { "epoch": 0.08917635725590037, "grad_norm": 0.5364236454230179, "learning_rate": 3.937461459816082e-05, "loss": 0.9165, "step": 324 }, { "epoch": 0.08945159292644327, "grad_norm": 0.575559207025926, "learning_rate": 3.937027197424679e-05, "loss": 0.901, "step": 325 }, { "epoch": 0.08972682859698616, "grad_norm": 0.647872054045973, "learning_rate": 3.9365914566192635e-05, "loss": 0.8753, "step": 326 }, { "epoch": 0.09000206426752907, "grad_norm": 0.6496703700196986, "learning_rate": 3.936154237732409e-05, "loss": 0.9088, "step": 327 }, { "epoch": 0.09027729993807197, "grad_norm": 0.6185755595736986, "learning_rate": 3.9357155410978184e-05, "loss": 0.9084, "step": 328 }, { "epoch": 0.09055253560861487, "grad_norm": 0.6250487984329662, "learning_rate": 3.9352753670503216e-05, "loss": 0.9227, "step": 329 }, { "epoch": 0.09082777127915778, "grad_norm": 0.60775145646772, "learning_rate": 3.934833715925877e-05, "loss": 0.8739, "step": 330 }, { "epoch": 0.09110300694970068, "grad_norm": 0.6171269689039475, "learning_rate": 3.934390588061569e-05, "loss": 0.8905, "step": 331 }, { "epoch": 0.09137824262024358, "grad_norm": 0.5524209601199233, "learning_rate": 3.933945983795611e-05, "loss": 0.8986, "step": 332 }, { "epoch": 0.09165347829078649, "grad_norm": 0.6016660474331017, "learning_rate": 3.933499903467341e-05, "loss": 0.9203, "step": 333 }, { "epoch": 0.09192871396132939, "grad_norm": 0.6058072777994923, "learning_rate": 3.933052347417225e-05, "loss": 0.9331, "step": 334 }, { "epoch": 0.09220394963187228, "grad_norm": 0.7352857206810751, "learning_rate": 3.932603315986856e-05, "loss": 0.8583, "step": 335 }, { "epoch": 0.0924791853024152, "grad_norm": 0.729197233429657, "learning_rate": 3.932152809518951e-05, "loss": 0.8843, "step": 336 }, { "epoch": 0.0927544209729581, "grad_norm": 0.6360878987285922, "learning_rate": 3.931700828357355e-05, "loss": 0.9146, "step": 337 }, { "epoch": 0.09302965664350099, "grad_norm": 0.5538393227342899, "learning_rate": 3.9312473728470364e-05, "loss": 0.8909, "step": 338 }, { "epoch": 0.0933048923140439, "grad_norm": 0.5572788719586435, "learning_rate": 3.9307924433340906e-05, "loss": 0.9228, "step": 339 }, { "epoch": 0.0935801279845868, "grad_norm": 0.6080360349844633, "learning_rate": 3.930336040165738e-05, "loss": 0.8727, "step": 340 }, { "epoch": 0.09385536365512971, "grad_norm": 0.794099959407607, "learning_rate": 3.9298781636903215e-05, "loss": 0.9092, "step": 341 }, { "epoch": 0.09413059932567261, "grad_norm": 0.6791574529442961, "learning_rate": 3.929418814257311e-05, "loss": 0.8966, "step": 342 }, { "epoch": 0.0944058349962155, "grad_norm": 0.5248090948237437, "learning_rate": 3.9289579922173e-05, "loss": 0.896, "step": 343 }, { "epoch": 0.09468107066675842, "grad_norm": 0.5284805439594273, "learning_rate": 3.9284956979220056e-05, "loss": 0.8968, "step": 344 }, { "epoch": 0.09495630633730132, "grad_norm": 0.550975681076434, "learning_rate": 3.928031931724269e-05, "loss": 0.9246, "step": 345 }, { "epoch": 0.09523154200784421, "grad_norm": 0.505929866249553, "learning_rate": 3.927566693978053e-05, "loss": 0.8796, "step": 346 }, { "epoch": 0.09550677767838713, "grad_norm": 0.5080069783013221, "learning_rate": 3.927099985038446e-05, "loss": 0.9042, "step": 347 }, { "epoch": 0.09578201334893002, "grad_norm": 0.5053139389182594, "learning_rate": 3.926631805261659e-05, "loss": 0.897, "step": 348 }, { "epoch": 0.09605724901947292, "grad_norm": 0.5291352192477727, "learning_rate": 3.926162155005024e-05, "loss": 0.8695, "step": 349 }, { "epoch": 0.09633248469001583, "grad_norm": 0.435108769302938, "learning_rate": 3.925691034626997e-05, "loss": 0.8927, "step": 350 }, { "epoch": 0.09660772036055873, "grad_norm": 0.4625701183252576, "learning_rate": 3.925218444487154e-05, "loss": 0.9128, "step": 351 }, { "epoch": 0.09688295603110163, "grad_norm": 0.5561823027644259, "learning_rate": 3.924744384946195e-05, "loss": 0.8551, "step": 352 }, { "epoch": 0.09715819170164454, "grad_norm": 0.6897758949035243, "learning_rate": 3.9242688563659406e-05, "loss": 0.8996, "step": 353 }, { "epoch": 0.09743342737218744, "grad_norm": 0.518046280528702, "learning_rate": 3.923791859109332e-05, "loss": 0.8713, "step": 354 }, { "epoch": 0.09770866304273033, "grad_norm": 0.6141786904296741, "learning_rate": 3.923313393540433e-05, "loss": 0.9132, "step": 355 }, { "epoch": 0.09798389871327325, "grad_norm": 0.48284229958533237, "learning_rate": 3.922833460024425e-05, "loss": 0.9018, "step": 356 }, { "epoch": 0.09825913438381614, "grad_norm": 0.4409973295054774, "learning_rate": 3.922352058927614e-05, "loss": 0.8537, "step": 357 }, { "epoch": 0.09853437005435904, "grad_norm": 0.4890342415470555, "learning_rate": 3.921869190617423e-05, "loss": 0.881, "step": 358 }, { "epoch": 0.09880960572490195, "grad_norm": 0.5005438507702201, "learning_rate": 3.921384855462396e-05, "loss": 0.8769, "step": 359 }, { "epoch": 0.09908484139544485, "grad_norm": 0.47314532654978314, "learning_rate": 3.920899053832195e-05, "loss": 0.8736, "step": 360 }, { "epoch": 0.09936007706598775, "grad_norm": 0.5428084046874413, "learning_rate": 3.920411786097605e-05, "loss": 0.8566, "step": 361 }, { "epoch": 0.09963531273653066, "grad_norm": 0.6599627349997179, "learning_rate": 3.919923052630526e-05, "loss": 0.8874, "step": 362 }, { "epoch": 0.09991054840707356, "grad_norm": 0.6702735192730017, "learning_rate": 3.9194328538039775e-05, "loss": 0.9135, "step": 363 }, { "epoch": 0.10018578407761645, "grad_norm": 0.6260073822159783, "learning_rate": 3.9189411899921e-05, "loss": 0.8642, "step": 364 }, { "epoch": 0.10046101974815937, "grad_norm": 0.6037436224945719, "learning_rate": 3.9184480615701496e-05, "loss": 0.898, "step": 365 }, { "epoch": 0.10073625541870226, "grad_norm": 0.553940568521368, "learning_rate": 3.917953468914501e-05, "loss": 0.8849, "step": 366 }, { "epoch": 0.10101149108924516, "grad_norm": 0.5691010429228973, "learning_rate": 3.917457412402645e-05, "loss": 0.8892, "step": 367 }, { "epoch": 0.10128672675978807, "grad_norm": 0.5059542629361516, "learning_rate": 3.916959892413194e-05, "loss": 0.9121, "step": 368 }, { "epoch": 0.10156196243033097, "grad_norm": 0.48598338363082094, "learning_rate": 3.9164609093258726e-05, "loss": 0.8686, "step": 369 }, { "epoch": 0.10183719810087387, "grad_norm": 0.582121236237182, "learning_rate": 3.9159604635215236e-05, "loss": 0.8563, "step": 370 }, { "epoch": 0.10211243377141678, "grad_norm": 0.6162442781859768, "learning_rate": 3.915458555382108e-05, "loss": 0.8713, "step": 371 }, { "epoch": 0.10238766944195968, "grad_norm": 0.6059270830911037, "learning_rate": 3.9149551852907e-05, "loss": 0.8955, "step": 372 }, { "epoch": 0.10266290511250258, "grad_norm": 0.5495886266547979, "learning_rate": 3.914450353631492e-05, "loss": 0.9098, "step": 373 }, { "epoch": 0.10293814078304549, "grad_norm": 0.7435991277339683, "learning_rate": 3.913944060789791e-05, "loss": 0.9084, "step": 374 }, { "epoch": 0.10321337645358838, "grad_norm": 0.5412944053150593, "learning_rate": 3.91343630715202e-05, "loss": 0.8736, "step": 375 }, { "epoch": 0.10348861212413128, "grad_norm": 0.5300055966983039, "learning_rate": 3.912927093105714e-05, "loss": 0.8706, "step": 376 }, { "epoch": 0.1037638477946742, "grad_norm": 0.5085340455305501, "learning_rate": 3.912416419039526e-05, "loss": 0.8844, "step": 377 }, { "epoch": 0.10403908346521709, "grad_norm": 0.5314131577090296, "learning_rate": 3.911904285343224e-05, "loss": 0.8811, "step": 378 }, { "epoch": 0.10431431913575999, "grad_norm": 0.5743431648299027, "learning_rate": 3.911390692407685e-05, "loss": 0.8823, "step": 379 }, { "epoch": 0.1045895548063029, "grad_norm": 0.5575204437188148, "learning_rate": 3.910875640624905e-05, "loss": 0.8732, "step": 380 }, { "epoch": 0.1048647904768458, "grad_norm": 0.6500868814849747, "learning_rate": 3.910359130387991e-05, "loss": 0.8587, "step": 381 }, { "epoch": 0.1051400261473887, "grad_norm": 0.5906306776112186, "learning_rate": 3.909841162091164e-05, "loss": 0.9026, "step": 382 }, { "epoch": 0.10541526181793161, "grad_norm": 0.6266630869849823, "learning_rate": 3.909321736129757e-05, "loss": 0.8938, "step": 383 }, { "epoch": 0.1056904974884745, "grad_norm": 0.6718226466340169, "learning_rate": 3.908800852900215e-05, "loss": 0.8786, "step": 384 }, { "epoch": 0.1059657331590174, "grad_norm": 0.6634999849790058, "learning_rate": 3.908278512800098e-05, "loss": 0.8885, "step": 385 }, { "epoch": 0.10624096882956031, "grad_norm": 0.6431127170000246, "learning_rate": 3.9077547162280754e-05, "loss": 0.8749, "step": 386 }, { "epoch": 0.10651620450010321, "grad_norm": 0.5710373048152448, "learning_rate": 3.907229463583928e-05, "loss": 0.8723, "step": 387 }, { "epoch": 0.10679144017064611, "grad_norm": 0.5416823760384776, "learning_rate": 3.9067027552685506e-05, "loss": 0.8954, "step": 388 }, { "epoch": 0.10706667584118902, "grad_norm": 0.4573907707132969, "learning_rate": 3.906174591683946e-05, "loss": 0.8981, "step": 389 }, { "epoch": 0.10734191151173192, "grad_norm": 0.5611673580990199, "learning_rate": 3.90564497323323e-05, "loss": 0.9131, "step": 390 }, { "epoch": 0.10761714718227482, "grad_norm": 0.6365557897336063, "learning_rate": 3.905113900320627e-05, "loss": 0.895, "step": 391 }, { "epoch": 0.10789238285281773, "grad_norm": 0.7015761098854781, "learning_rate": 3.904581373351474e-05, "loss": 0.8965, "step": 392 }, { "epoch": 0.10816761852336063, "grad_norm": 0.637562535977938, "learning_rate": 3.9040473927322136e-05, "loss": 0.8802, "step": 393 }, { "epoch": 0.10844285419390352, "grad_norm": 0.6240363416564655, "learning_rate": 3.9035119588704026e-05, "loss": 0.9175, "step": 394 }, { "epoch": 0.10871808986444643, "grad_norm": 0.5664443709947848, "learning_rate": 3.902975072174704e-05, "loss": 0.8742, "step": 395 }, { "epoch": 0.10899332553498933, "grad_norm": 0.5507530290972074, "learning_rate": 3.9024367330548904e-05, "loss": 0.8716, "step": 396 }, { "epoch": 0.10926856120553223, "grad_norm": 0.631821090156823, "learning_rate": 3.901896941921843e-05, "loss": 0.901, "step": 397 }, { "epoch": 0.10954379687607514, "grad_norm": 0.6293458415988438, "learning_rate": 3.9013556991875515e-05, "loss": 0.8666, "step": 398 }, { "epoch": 0.10981903254661804, "grad_norm": 0.5983356080974974, "learning_rate": 3.900813005265113e-05, "loss": 0.8703, "step": 399 }, { "epoch": 0.11009426821716094, "grad_norm": 0.4978346446457818, "learning_rate": 3.9002688605687334e-05, "loss": 0.8923, "step": 400 }, { "epoch": 0.11036950388770385, "grad_norm": 0.48512440368202475, "learning_rate": 3.8997232655137234e-05, "loss": 0.8714, "step": 401 }, { "epoch": 0.11064473955824675, "grad_norm": 0.517562020241431, "learning_rate": 3.899176220516504e-05, "loss": 0.8678, "step": 402 }, { "epoch": 0.11091997522878966, "grad_norm": 0.5717948870585363, "learning_rate": 3.8986277259945996e-05, "loss": 0.8691, "step": 403 }, { "epoch": 0.11119521089933256, "grad_norm": 0.4991076776447727, "learning_rate": 3.898077782366643e-05, "loss": 0.874, "step": 404 }, { "epoch": 0.11147044656987545, "grad_norm": 0.4855826990964506, "learning_rate": 3.897526390052372e-05, "loss": 0.8593, "step": 405 }, { "epoch": 0.11174568224041836, "grad_norm": 0.4874229602866942, "learning_rate": 3.8969735494726306e-05, "loss": 0.8838, "step": 406 }, { "epoch": 0.11202091791096126, "grad_norm": 0.5330378392189458, "learning_rate": 3.896419261049369e-05, "loss": 0.8427, "step": 407 }, { "epoch": 0.11229615358150416, "grad_norm": 0.575968875495886, "learning_rate": 3.8958635252056404e-05, "loss": 0.8692, "step": 408 }, { "epoch": 0.11257138925204707, "grad_norm": 0.4801689204644928, "learning_rate": 3.8953063423656055e-05, "loss": 0.892, "step": 409 }, { "epoch": 0.11284662492258997, "grad_norm": 0.48737892640435837, "learning_rate": 3.8947477129545256e-05, "loss": 0.8883, "step": 410 }, { "epoch": 0.11312186059313287, "grad_norm": 0.5244170455238508, "learning_rate": 3.89418763739877e-05, "loss": 0.8641, "step": 411 }, { "epoch": 0.11339709626367578, "grad_norm": 0.5375336765948339, "learning_rate": 3.8936261161258094e-05, "loss": 0.879, "step": 412 }, { "epoch": 0.11367233193421868, "grad_norm": 0.5194538613237015, "learning_rate": 3.893063149564218e-05, "loss": 0.8546, "step": 413 }, { "epoch": 0.11394756760476157, "grad_norm": 0.5520513158148513, "learning_rate": 3.8924987381436746e-05, "loss": 0.8748, "step": 414 }, { "epoch": 0.11422280327530449, "grad_norm": 0.6132063585013063, "learning_rate": 3.8919328822949587e-05, "loss": 0.8525, "step": 415 }, { "epoch": 0.11449803894584738, "grad_norm": 0.6751742930556689, "learning_rate": 3.8913655824499536e-05, "loss": 0.8704, "step": 416 }, { "epoch": 0.11477327461639028, "grad_norm": 0.7321795805175227, "learning_rate": 3.890796839041646e-05, "loss": 0.8755, "step": 417 }, { "epoch": 0.11504851028693319, "grad_norm": 0.5708036927403648, "learning_rate": 3.890226652504121e-05, "loss": 0.8703, "step": 418 }, { "epoch": 0.11532374595747609, "grad_norm": 0.504748003198664, "learning_rate": 3.889655023272568e-05, "loss": 0.8596, "step": 419 }, { "epoch": 0.11559898162801899, "grad_norm": 0.5900956763318453, "learning_rate": 3.889081951783276e-05, "loss": 0.9089, "step": 420 }, { "epoch": 0.1158742172985619, "grad_norm": 0.6466905218632802, "learning_rate": 3.888507438473636e-05, "loss": 0.8628, "step": 421 }, { "epoch": 0.1161494529691048, "grad_norm": 0.6078563855062546, "learning_rate": 3.887931483782137e-05, "loss": 0.9246, "step": 422 }, { "epoch": 0.1164246886396477, "grad_norm": 0.49866059364732357, "learning_rate": 3.8873540881483725e-05, "loss": 0.8576, "step": 423 }, { "epoch": 0.1166999243101906, "grad_norm": 0.5124859820552345, "learning_rate": 3.8867752520130315e-05, "loss": 0.8908, "step": 424 }, { "epoch": 0.1169751599807335, "grad_norm": 0.5627280720731888, "learning_rate": 3.8861949758179044e-05, "loss": 0.8969, "step": 425 }, { "epoch": 0.1172503956512764, "grad_norm": 0.5759560120018811, "learning_rate": 3.88561326000588e-05, "loss": 0.8467, "step": 426 }, { "epoch": 0.11752563132181931, "grad_norm": 0.5862534370058109, "learning_rate": 3.8850301050209476e-05, "loss": 0.9076, "step": 427 }, { "epoch": 0.11780086699236221, "grad_norm": 0.5567090118087943, "learning_rate": 3.8844455113081915e-05, "loss": 0.8969, "step": 428 }, { "epoch": 0.11807610266290511, "grad_norm": 0.5325378918415913, "learning_rate": 3.883859479313798e-05, "loss": 0.8923, "step": 429 }, { "epoch": 0.11835133833344802, "grad_norm": 0.5272783210005675, "learning_rate": 3.883272009485049e-05, "loss": 0.8667, "step": 430 }, { "epoch": 0.11862657400399092, "grad_norm": 0.5457322963275876, "learning_rate": 3.8826831022703245e-05, "loss": 0.8551, "step": 431 }, { "epoch": 0.11890180967453381, "grad_norm": 0.5741588144845013, "learning_rate": 3.882092758119099e-05, "loss": 0.8421, "step": 432 }, { "epoch": 0.11917704534507673, "grad_norm": 0.4836464637866908, "learning_rate": 3.88150097748195e-05, "loss": 0.8777, "step": 433 }, { "epoch": 0.11945228101561962, "grad_norm": 0.5898419315572756, "learning_rate": 3.8809077608105435e-05, "loss": 0.8443, "step": 434 }, { "epoch": 0.11972751668616252, "grad_norm": 0.5942615878371786, "learning_rate": 3.8803131085576477e-05, "loss": 0.8509, "step": 435 }, { "epoch": 0.12000275235670543, "grad_norm": 0.5024995991629244, "learning_rate": 3.879717021177123e-05, "loss": 0.9012, "step": 436 }, { "epoch": 0.12027798802724833, "grad_norm": 0.49176826477914476, "learning_rate": 3.879119499123927e-05, "loss": 0.9095, "step": 437 }, { "epoch": 0.12055322369779123, "grad_norm": 0.4997512671977748, "learning_rate": 3.878520542854111e-05, "loss": 0.8522, "step": 438 }, { "epoch": 0.12082845936833414, "grad_norm": 0.5174557816004738, "learning_rate": 3.877920152824822e-05, "loss": 0.8709, "step": 439 }, { "epoch": 0.12110369503887704, "grad_norm": 0.5462519993585703, "learning_rate": 3.8773183294943015e-05, "loss": 0.8558, "step": 440 }, { "epoch": 0.12137893070941994, "grad_norm": 0.5669830253709262, "learning_rate": 3.876715073321883e-05, "loss": 0.8589, "step": 441 }, { "epoch": 0.12165416637996285, "grad_norm": 0.5559979293663662, "learning_rate": 3.876110384767996e-05, "loss": 0.8666, "step": 442 }, { "epoch": 0.12192940205050574, "grad_norm": 0.5352320132499331, "learning_rate": 3.875504264294161e-05, "loss": 0.8658, "step": 443 }, { "epoch": 0.12220463772104864, "grad_norm": 0.5097852955806261, "learning_rate": 3.874896712362994e-05, "loss": 0.8923, "step": 444 }, { "epoch": 0.12247987339159155, "grad_norm": 0.5679336085640061, "learning_rate": 3.874287729438201e-05, "loss": 0.8747, "step": 445 }, { "epoch": 0.12275510906213445, "grad_norm": 0.5717500090072024, "learning_rate": 3.873677315984582e-05, "loss": 0.9141, "step": 446 }, { "epoch": 0.12303034473267735, "grad_norm": 0.5351395879024524, "learning_rate": 3.8730654724680284e-05, "loss": 0.887, "step": 447 }, { "epoch": 0.12330558040322026, "grad_norm": 0.5056236978853025, "learning_rate": 3.8724521993555216e-05, "loss": 0.8712, "step": 448 }, { "epoch": 0.12358081607376316, "grad_norm": 0.445389896216137, "learning_rate": 3.8718374971151356e-05, "loss": 0.8856, "step": 449 }, { "epoch": 0.12385605174430606, "grad_norm": 0.4979986798823173, "learning_rate": 3.871221366216036e-05, "loss": 0.884, "step": 450 }, { "epoch": 0.12413128741484897, "grad_norm": 0.5098373282634219, "learning_rate": 3.870603807128477e-05, "loss": 0.8824, "step": 451 }, { "epoch": 0.12440652308539187, "grad_norm": 0.4884571444922137, "learning_rate": 3.869984820323804e-05, "loss": 0.866, "step": 452 }, { "epoch": 0.12468175875593476, "grad_norm": 0.48319386031254, "learning_rate": 3.86936440627445e-05, "loss": 0.8622, "step": 453 }, { "epoch": 0.12495699442647767, "grad_norm": 0.4875346343493672, "learning_rate": 3.868742565453941e-05, "loss": 0.9008, "step": 454 }, { "epoch": 0.12523223009702059, "grad_norm": 0.44164375087009367, "learning_rate": 3.868119298336889e-05, "loss": 0.865, "step": 455 }, { "epoch": 0.12550746576756347, "grad_norm": 0.5185602016440723, "learning_rate": 3.867494605398996e-05, "loss": 0.8768, "step": 456 }, { "epoch": 0.12578270143810638, "grad_norm": 0.537626827141255, "learning_rate": 3.8668684871170514e-05, "loss": 0.8512, "step": 457 }, { "epoch": 0.1260579371086493, "grad_norm": 0.4693844997367465, "learning_rate": 3.866240943968932e-05, "loss": 0.8425, "step": 458 }, { "epoch": 0.12633317277919218, "grad_norm": 0.531515674970887, "learning_rate": 3.865611976433605e-05, "loss": 0.8819, "step": 459 }, { "epoch": 0.1266084084497351, "grad_norm": 0.5745310526190663, "learning_rate": 3.864981584991122e-05, "loss": 0.8788, "step": 460 }, { "epoch": 0.126883644120278, "grad_norm": 0.5770227255769966, "learning_rate": 3.864349770122621e-05, "loss": 0.8797, "step": 461 }, { "epoch": 0.12715887979082088, "grad_norm": 0.5103586863720179, "learning_rate": 3.863716532310329e-05, "loss": 0.9062, "step": 462 }, { "epoch": 0.1274341154613638, "grad_norm": 0.4781784565411039, "learning_rate": 3.863081872037557e-05, "loss": 0.8687, "step": 463 }, { "epoch": 0.1277093511319067, "grad_norm": 0.4317409857087674, "learning_rate": 3.862445789788701e-05, "loss": 0.9079, "step": 464 }, { "epoch": 0.1279845868024496, "grad_norm": 0.44915572043977114, "learning_rate": 3.8618082860492456e-05, "loss": 0.8738, "step": 465 }, { "epoch": 0.1282598224729925, "grad_norm": 0.6805523727920348, "learning_rate": 3.861169361305757e-05, "loss": 0.8607, "step": 466 }, { "epoch": 0.1285350581435354, "grad_norm": 0.51228196439054, "learning_rate": 3.860529016045888e-05, "loss": 0.8927, "step": 467 }, { "epoch": 0.1288102938140783, "grad_norm": 0.672172331204869, "learning_rate": 3.859887250758374e-05, "loss": 0.847, "step": 468 }, { "epoch": 0.1290855294846212, "grad_norm": 0.6193259834184921, "learning_rate": 3.8592440659330354e-05, "loss": 0.8587, "step": 469 }, { "epoch": 0.12936076515516412, "grad_norm": 0.5338838370354034, "learning_rate": 3.858599462060776e-05, "loss": 0.8661, "step": 470 }, { "epoch": 0.129636000825707, "grad_norm": 0.4871517328100876, "learning_rate": 3.8579534396335835e-05, "loss": 0.8719, "step": 471 }, { "epoch": 0.12991123649624992, "grad_norm": 0.5398484469959097, "learning_rate": 3.857305999144525e-05, "loss": 0.8482, "step": 472 }, { "epoch": 0.13018647216679283, "grad_norm": 1.0737124169159604, "learning_rate": 3.856657141087753e-05, "loss": 0.877, "step": 473 }, { "epoch": 0.1304617078373357, "grad_norm": 0.5712118687368332, "learning_rate": 3.8560068659585006e-05, "loss": 0.9126, "step": 474 }, { "epoch": 0.13073694350787862, "grad_norm": 0.5299285938372721, "learning_rate": 3.855355174253084e-05, "loss": 0.8648, "step": 475 }, { "epoch": 0.13101217917842153, "grad_norm": 0.5832496967442821, "learning_rate": 3.854702066468899e-05, "loss": 0.8767, "step": 476 }, { "epoch": 0.13128741484896442, "grad_norm": 0.5768216753062566, "learning_rate": 3.8540475431044224e-05, "loss": 0.8955, "step": 477 }, { "epoch": 0.13156265051950733, "grad_norm": 0.5371945157499757, "learning_rate": 3.8533916046592115e-05, "loss": 0.8397, "step": 478 }, { "epoch": 0.13183788619005024, "grad_norm": 0.5555649404302065, "learning_rate": 3.852734251633905e-05, "loss": 0.8653, "step": 479 }, { "epoch": 0.13211312186059312, "grad_norm": 0.5379325982134389, "learning_rate": 3.852075484530219e-05, "loss": 0.8407, "step": 480 }, { "epoch": 0.13238835753113604, "grad_norm": 0.6364764043277225, "learning_rate": 3.85141530385095e-05, "loss": 0.8481, "step": 481 }, { "epoch": 0.13266359320167895, "grad_norm": 0.6346413463658178, "learning_rate": 3.8507537100999746e-05, "loss": 0.8597, "step": 482 }, { "epoch": 0.13293882887222183, "grad_norm": 0.5880283792103489, "learning_rate": 3.850090703782246e-05, "loss": 0.8712, "step": 483 }, { "epoch": 0.13321406454276474, "grad_norm": 0.5177115144515265, "learning_rate": 3.8494262854037955e-05, "loss": 0.8448, "step": 484 }, { "epoch": 0.13348930021330765, "grad_norm": 0.4980145821933916, "learning_rate": 3.848760455471734e-05, "loss": 0.9094, "step": 485 }, { "epoch": 0.13376453588385054, "grad_norm": 0.547624989774016, "learning_rate": 3.848093214494248e-05, "loss": 0.8744, "step": 486 }, { "epoch": 0.13403977155439345, "grad_norm": 0.618583275402833, "learning_rate": 3.847424562980602e-05, "loss": 0.8576, "step": 487 }, { "epoch": 0.13431500722493636, "grad_norm": 0.4766724787140889, "learning_rate": 3.8467545014411365e-05, "loss": 0.8627, "step": 488 }, { "epoch": 0.13459024289547925, "grad_norm": 0.49314930291044035, "learning_rate": 3.846083030387268e-05, "loss": 0.8773, "step": 489 }, { "epoch": 0.13486547856602216, "grad_norm": 0.5726859587483527, "learning_rate": 3.8454101503314896e-05, "loss": 0.8688, "step": 490 }, { "epoch": 0.13514071423656507, "grad_norm": 0.5209229799023615, "learning_rate": 3.84473586178737e-05, "loss": 0.8446, "step": 491 }, { "epoch": 0.13541594990710795, "grad_norm": 0.5380188644630678, "learning_rate": 3.8440601652695504e-05, "loss": 0.8615, "step": 492 }, { "epoch": 0.13569118557765086, "grad_norm": 0.5368345332836999, "learning_rate": 3.84338306129375e-05, "loss": 0.872, "step": 493 }, { "epoch": 0.13596642124819378, "grad_norm": 0.4941502507243993, "learning_rate": 3.842704550376761e-05, "loss": 0.8813, "step": 494 }, { "epoch": 0.13624165691873666, "grad_norm": 0.510201925526349, "learning_rate": 3.842024633036448e-05, "loss": 0.8516, "step": 495 }, { "epoch": 0.13651689258927957, "grad_norm": 0.5859285227055108, "learning_rate": 3.841343309791751e-05, "loss": 0.8465, "step": 496 }, { "epoch": 0.13679212825982248, "grad_norm": 0.5051374546024748, "learning_rate": 3.8406605811626814e-05, "loss": 0.8764, "step": 497 }, { "epoch": 0.13706736393036537, "grad_norm": 0.4777860450594652, "learning_rate": 3.8399764476703244e-05, "loss": 0.8865, "step": 498 }, { "epoch": 0.13734259960090828, "grad_norm": 0.4648482392371114, "learning_rate": 3.8392909098368377e-05, "loss": 0.8696, "step": 499 }, { "epoch": 0.1376178352714512, "grad_norm": 0.39194934141636306, "learning_rate": 3.8386039681854504e-05, "loss": 0.8735, "step": 500 }, { "epoch": 0.13789307094199407, "grad_norm": 0.48648359915890216, "learning_rate": 3.837915623240462e-05, "loss": 0.8688, "step": 501 }, { "epoch": 0.13816830661253698, "grad_norm": 0.5328871694309647, "learning_rate": 3.837225875527244e-05, "loss": 0.8696, "step": 502 }, { "epoch": 0.1384435422830799, "grad_norm": 0.46675333860858087, "learning_rate": 3.8365347255722396e-05, "loss": 0.8423, "step": 503 }, { "epoch": 0.13871877795362278, "grad_norm": 0.473226686282098, "learning_rate": 3.835842173902959e-05, "loss": 0.8478, "step": 504 }, { "epoch": 0.1389940136241657, "grad_norm": 0.43029063072170615, "learning_rate": 3.835148221047988e-05, "loss": 0.8599, "step": 505 }, { "epoch": 0.1392692492947086, "grad_norm": 0.3952610348334728, "learning_rate": 3.834452867536974e-05, "loss": 0.8493, "step": 506 }, { "epoch": 0.1395444849652515, "grad_norm": 0.46255450767863043, "learning_rate": 3.8337561139006405e-05, "loss": 0.8435, "step": 507 }, { "epoch": 0.1398197206357944, "grad_norm": 0.5907346043765216, "learning_rate": 3.833057960670776e-05, "loss": 0.867, "step": 508 }, { "epoch": 0.1400949563063373, "grad_norm": 0.41416936627426143, "learning_rate": 3.832358408380239e-05, "loss": 0.8642, "step": 509 }, { "epoch": 0.1403701919768802, "grad_norm": 0.3578028940778584, "learning_rate": 3.8316574575629524e-05, "loss": 0.8859, "step": 510 }, { "epoch": 0.1406454276474231, "grad_norm": 0.4532996072863284, "learning_rate": 3.8309551087539116e-05, "loss": 0.8808, "step": 511 }, { "epoch": 0.14092066331796602, "grad_norm": 0.40684550825360394, "learning_rate": 3.8302513624891743e-05, "loss": 0.8676, "step": 512 }, { "epoch": 0.1411958989885089, "grad_norm": 0.40300225878763124, "learning_rate": 3.8295462193058686e-05, "loss": 0.8376, "step": 513 }, { "epoch": 0.1414711346590518, "grad_norm": 0.39067970235574856, "learning_rate": 3.8288396797421855e-05, "loss": 0.8937, "step": 514 }, { "epoch": 0.14174637032959472, "grad_norm": 0.5695505575761707, "learning_rate": 3.828131744337384e-05, "loss": 0.8645, "step": 515 }, { "epoch": 0.1420216060001376, "grad_norm": 0.4526706615787753, "learning_rate": 3.8274224136317884e-05, "loss": 0.8576, "step": 516 }, { "epoch": 0.14229684167068052, "grad_norm": 0.4287103017952185, "learning_rate": 3.8267116881667855e-05, "loss": 0.8805, "step": 517 }, { "epoch": 0.14257207734122343, "grad_norm": 0.48674747019729414, "learning_rate": 3.8259995684848306e-05, "loss": 0.8482, "step": 518 }, { "epoch": 0.1428473130117663, "grad_norm": 0.49325687985959527, "learning_rate": 3.82528605512944e-05, "loss": 0.8804, "step": 519 }, { "epoch": 0.14312254868230923, "grad_norm": 0.49645020641649507, "learning_rate": 3.824571148645194e-05, "loss": 0.8835, "step": 520 }, { "epoch": 0.14339778435285214, "grad_norm": 0.4717842701616438, "learning_rate": 3.823854849577738e-05, "loss": 0.8808, "step": 521 }, { "epoch": 0.14367302002339502, "grad_norm": 0.5017642201403522, "learning_rate": 3.823137158473778e-05, "loss": 0.8738, "step": 522 }, { "epoch": 0.14394825569393793, "grad_norm": 0.46900408870612853, "learning_rate": 3.8224180758810845e-05, "loss": 0.8466, "step": 523 }, { "epoch": 0.14422349136448084, "grad_norm": 0.4649485101983954, "learning_rate": 3.821697602348489e-05, "loss": 0.8637, "step": 524 }, { "epoch": 0.14449872703502373, "grad_norm": 0.7670893359269847, "learning_rate": 3.820975738425884e-05, "loss": 0.8791, "step": 525 }, { "epoch": 0.14477396270556664, "grad_norm": 0.4634061219146758, "learning_rate": 3.8202524846642246e-05, "loss": 0.8598, "step": 526 }, { "epoch": 0.14504919837610955, "grad_norm": 0.4562083751894467, "learning_rate": 3.8195278416155266e-05, "loss": 0.8457, "step": 527 }, { "epoch": 0.14532443404665243, "grad_norm": 0.43986677245180233, "learning_rate": 3.8188018098328636e-05, "loss": 0.8606, "step": 528 }, { "epoch": 0.14559966971719535, "grad_norm": 0.45943787267139574, "learning_rate": 3.8180743898703735e-05, "loss": 0.844, "step": 529 }, { "epoch": 0.14587490538773826, "grad_norm": 0.4868762429617816, "learning_rate": 3.81734558228325e-05, "loss": 0.8649, "step": 530 }, { "epoch": 0.14615014105828114, "grad_norm": 0.5229564492816823, "learning_rate": 3.816615387627748e-05, "loss": 0.8808, "step": 531 }, { "epoch": 0.14642537672882405, "grad_norm": 0.47491592303452623, "learning_rate": 3.8158838064611784e-05, "loss": 0.8836, "step": 532 }, { "epoch": 0.14670061239936696, "grad_norm": 0.4892872843931489, "learning_rate": 3.815150839341915e-05, "loss": 0.8501, "step": 533 }, { "epoch": 0.14697584806990985, "grad_norm": 0.5067111165431507, "learning_rate": 3.814416486829384e-05, "loss": 0.8787, "step": 534 }, { "epoch": 0.14725108374045276, "grad_norm": 0.5847903189477168, "learning_rate": 3.813680749484073e-05, "loss": 0.8862, "step": 535 }, { "epoch": 0.14752631941099567, "grad_norm": 0.5307891953867472, "learning_rate": 3.812943627867525e-05, "loss": 0.8447, "step": 536 }, { "epoch": 0.14780155508153855, "grad_norm": 0.43437401733213127, "learning_rate": 3.81220512254234e-05, "loss": 0.8755, "step": 537 }, { "epoch": 0.14807679075208147, "grad_norm": 0.4498193054640528, "learning_rate": 3.811465234072173e-05, "loss": 0.863, "step": 538 }, { "epoch": 0.14835202642262438, "grad_norm": 0.510868396643692, "learning_rate": 3.810723963021737e-05, "loss": 0.8801, "step": 539 }, { "epoch": 0.14862726209316726, "grad_norm": 0.4608288472149214, "learning_rate": 3.8099813099567964e-05, "loss": 0.8661, "step": 540 }, { "epoch": 0.14890249776371017, "grad_norm": 0.4717856489674427, "learning_rate": 3.809237275444174e-05, "loss": 0.8366, "step": 541 }, { "epoch": 0.14917773343425308, "grad_norm": 0.5434506620844847, "learning_rate": 3.808491860051747e-05, "loss": 0.8596, "step": 542 }, { "epoch": 0.14945296910479597, "grad_norm": 0.5381320292887712, "learning_rate": 3.8077450643484424e-05, "loss": 0.8555, "step": 543 }, { "epoch": 0.14972820477533888, "grad_norm": 0.5143549118332782, "learning_rate": 3.806996888904245e-05, "loss": 0.8644, "step": 544 }, { "epoch": 0.1500034404458818, "grad_norm": 0.7267877355499845, "learning_rate": 3.8062473342901925e-05, "loss": 0.8616, "step": 545 }, { "epoch": 0.15027867611642468, "grad_norm": 0.41183292653113557, "learning_rate": 3.805496401078372e-05, "loss": 0.8667, "step": 546 }, { "epoch": 0.1505539117869676, "grad_norm": 0.5024814370294475, "learning_rate": 3.804744089841926e-05, "loss": 0.8914, "step": 547 }, { "epoch": 0.1508291474575105, "grad_norm": 0.5942440551592292, "learning_rate": 3.803990401155046e-05, "loss": 0.8633, "step": 548 }, { "epoch": 0.15110438312805338, "grad_norm": 0.5566367549335995, "learning_rate": 3.8032353355929773e-05, "loss": 0.8756, "step": 549 }, { "epoch": 0.1513796187985963, "grad_norm": 0.49409579251287367, "learning_rate": 3.802478893732016e-05, "loss": 0.8683, "step": 550 }, { "epoch": 0.1516548544691392, "grad_norm": 0.3725949960956119, "learning_rate": 3.801721076149506e-05, "loss": 0.8706, "step": 551 }, { "epoch": 0.1519300901396821, "grad_norm": 0.4626308585605636, "learning_rate": 3.8009618834238445e-05, "loss": 0.8505, "step": 552 }, { "epoch": 0.152205325810225, "grad_norm": 0.5238016550980202, "learning_rate": 3.8002013161344755e-05, "loss": 0.864, "step": 553 }, { "epoch": 0.1524805614807679, "grad_norm": 0.5662212591295838, "learning_rate": 3.7994393748618945e-05, "loss": 0.8404, "step": 554 }, { "epoch": 0.15275579715131082, "grad_norm": 0.5457119329512516, "learning_rate": 3.798676060187644e-05, "loss": 0.8617, "step": 555 }, { "epoch": 0.1530310328218537, "grad_norm": 0.49122969359545865, "learning_rate": 3.797911372694314e-05, "loss": 0.8658, "step": 556 }, { "epoch": 0.15330626849239662, "grad_norm": 0.4129007932799576, "learning_rate": 3.797145312965546e-05, "loss": 0.8635, "step": 557 }, { "epoch": 0.15358150416293953, "grad_norm": 0.45508532583757116, "learning_rate": 3.796377881586025e-05, "loss": 0.8575, "step": 558 }, { "epoch": 0.15385673983348241, "grad_norm": 0.5207853672493407, "learning_rate": 3.795609079141484e-05, "loss": 0.8626, "step": 559 }, { "epoch": 0.15413197550402533, "grad_norm": 0.581890744677401, "learning_rate": 3.7948389062187025e-05, "loss": 0.8693, "step": 560 }, { "epoch": 0.15440721117456824, "grad_norm": 0.5883911873807134, "learning_rate": 3.794067363405508e-05, "loss": 0.846, "step": 561 }, { "epoch": 0.15468244684511112, "grad_norm": 0.5207574905100074, "learning_rate": 3.79329445129077e-05, "loss": 0.8247, "step": 562 }, { "epoch": 0.15495768251565403, "grad_norm": 0.4523953811760909, "learning_rate": 3.792520170464406e-05, "loss": 0.8442, "step": 563 }, { "epoch": 0.15523291818619694, "grad_norm": 0.49497191299981996, "learning_rate": 3.7917445215173765e-05, "loss": 0.8572, "step": 564 }, { "epoch": 0.15550815385673983, "grad_norm": 0.5961479001151687, "learning_rate": 3.7909675050416864e-05, "loss": 0.8504, "step": 565 }, { "epoch": 0.15578338952728274, "grad_norm": 0.5553065203732548, "learning_rate": 3.7901891216303855e-05, "loss": 0.8497, "step": 566 }, { "epoch": 0.15605862519782565, "grad_norm": 0.4887797088139101, "learning_rate": 3.789409371877566e-05, "loss": 0.8654, "step": 567 }, { "epoch": 0.15633386086836853, "grad_norm": 0.4507456310067676, "learning_rate": 3.7886282563783626e-05, "loss": 0.8922, "step": 568 }, { "epoch": 0.15660909653891145, "grad_norm": 0.534811993235575, "learning_rate": 3.787845775728953e-05, "loss": 0.8766, "step": 569 }, { "epoch": 0.15688433220945436, "grad_norm": 0.5537662622733155, "learning_rate": 3.7870619305265566e-05, "loss": 0.8625, "step": 570 }, { "epoch": 0.15715956787999724, "grad_norm": 0.4727809559456603, "learning_rate": 3.7862767213694347e-05, "loss": 0.8461, "step": 571 }, { "epoch": 0.15743480355054015, "grad_norm": 0.4418209320709128, "learning_rate": 3.785490148856889e-05, "loss": 0.8553, "step": 572 }, { "epoch": 0.15771003922108306, "grad_norm": 0.47389940708387823, "learning_rate": 3.784702213589262e-05, "loss": 0.854, "step": 573 }, { "epoch": 0.15798527489162595, "grad_norm": 0.518179401198152, "learning_rate": 3.7839129161679366e-05, "loss": 0.8552, "step": 574 }, { "epoch": 0.15826051056216886, "grad_norm": 0.4639622479114093, "learning_rate": 3.7831222571953344e-05, "loss": 0.8715, "step": 575 }, { "epoch": 0.15853574623271177, "grad_norm": 0.4323743239097041, "learning_rate": 3.782330237274918e-05, "loss": 0.8451, "step": 576 }, { "epoch": 0.15881098190325466, "grad_norm": 0.41998341728671285, "learning_rate": 3.7815368570111866e-05, "loss": 0.8561, "step": 577 }, { "epoch": 0.15908621757379757, "grad_norm": 0.37329638579985064, "learning_rate": 3.780742117009679e-05, "loss": 0.8597, "step": 578 }, { "epoch": 0.15936145324434048, "grad_norm": 0.3992371734908037, "learning_rate": 3.779946017876972e-05, "loss": 0.8547, "step": 579 }, { "epoch": 0.15963668891488336, "grad_norm": 0.46504937163189863, "learning_rate": 3.7791485602206786e-05, "loss": 0.8815, "step": 580 }, { "epoch": 0.15991192458542627, "grad_norm": 0.44217107149660534, "learning_rate": 3.778349744649449e-05, "loss": 0.8611, "step": 581 }, { "epoch": 0.16018716025596919, "grad_norm": 0.353586171048602, "learning_rate": 3.777549571772971e-05, "loss": 0.8401, "step": 582 }, { "epoch": 0.16046239592651207, "grad_norm": 0.44457618940068655, "learning_rate": 3.776748042201968e-05, "loss": 0.8659, "step": 583 }, { "epoch": 0.16073763159705498, "grad_norm": 0.45988463481003333, "learning_rate": 3.775945156548196e-05, "loss": 0.8532, "step": 584 }, { "epoch": 0.1610128672675979, "grad_norm": 0.4618503545272013, "learning_rate": 3.77514091542445e-05, "loss": 0.8598, "step": 585 }, { "epoch": 0.16128810293814078, "grad_norm": 0.6234601884083827, "learning_rate": 3.774335319444558e-05, "loss": 0.829, "step": 586 }, { "epoch": 0.1615633386086837, "grad_norm": 0.47049098450575466, "learning_rate": 3.773528369223382e-05, "loss": 0.9023, "step": 587 }, { "epoch": 0.1618385742792266, "grad_norm": 0.44398244533292125, "learning_rate": 3.772720065376817e-05, "loss": 0.8582, "step": 588 }, { "epoch": 0.16211380994976948, "grad_norm": 0.629968731666768, "learning_rate": 3.771910408521792e-05, "loss": 0.8834, "step": 589 }, { "epoch": 0.1623890456203124, "grad_norm": 0.539380515557297, "learning_rate": 3.771099399276268e-05, "loss": 0.8411, "step": 590 }, { "epoch": 0.1626642812908553, "grad_norm": 0.4861709192806457, "learning_rate": 3.7702870382592394e-05, "loss": 0.8781, "step": 591 }, { "epoch": 0.1629395169613982, "grad_norm": 0.42621808860582056, "learning_rate": 3.769473326090731e-05, "loss": 0.8651, "step": 592 }, { "epoch": 0.1632147526319411, "grad_norm": 0.4380200683345255, "learning_rate": 3.768658263391799e-05, "loss": 0.8723, "step": 593 }, { "epoch": 0.163489988302484, "grad_norm": 0.451153728165318, "learning_rate": 3.7678418507845316e-05, "loss": 0.8783, "step": 594 }, { "epoch": 0.1637652239730269, "grad_norm": 0.422539721328803, "learning_rate": 3.767024088892046e-05, "loss": 0.8623, "step": 595 }, { "epoch": 0.1640404596435698, "grad_norm": 0.449545734716924, "learning_rate": 3.76620497833849e-05, "loss": 0.8816, "step": 596 }, { "epoch": 0.16431569531411272, "grad_norm": 0.4656625878247244, "learning_rate": 3.76538451974904e-05, "loss": 0.8622, "step": 597 }, { "epoch": 0.1645909309846556, "grad_norm": 0.4118909765823919, "learning_rate": 3.764562713749902e-05, "loss": 0.855, "step": 598 }, { "epoch": 0.16486616665519851, "grad_norm": 0.43039838699812394, "learning_rate": 3.7637395609683093e-05, "loss": 0.8899, "step": 599 }, { "epoch": 0.16514140232574143, "grad_norm": 0.418011137759663, "learning_rate": 3.7629150620325255e-05, "loss": 0.8529, "step": 600 }, { "epoch": 0.1654166379962843, "grad_norm": 0.46295060796973236, "learning_rate": 3.762089217571839e-05, "loss": 0.8591, "step": 601 }, { "epoch": 0.16569187366682722, "grad_norm": 0.4136084190087584, "learning_rate": 3.761262028216566e-05, "loss": 0.8364, "step": 602 }, { "epoch": 0.16596710933737013, "grad_norm": 0.41656759129172366, "learning_rate": 3.76043349459805e-05, "loss": 0.8951, "step": 603 }, { "epoch": 0.16624234500791302, "grad_norm": 0.440927658691817, "learning_rate": 3.75960361734866e-05, "loss": 0.8554, "step": 604 }, { "epoch": 0.16651758067845593, "grad_norm": 0.43515328904506384, "learning_rate": 3.75877239710179e-05, "loss": 0.8583, "step": 605 }, { "epoch": 0.16679281634899884, "grad_norm": 0.3819785628991007, "learning_rate": 3.757939834491858e-05, "loss": 0.8571, "step": 606 }, { "epoch": 0.16706805201954172, "grad_norm": 0.3902177082821287, "learning_rate": 3.7571059301543104e-05, "loss": 0.8468, "step": 607 }, { "epoch": 0.16734328769008464, "grad_norm": 0.49812295204183166, "learning_rate": 3.756270684725614e-05, "loss": 0.8362, "step": 608 }, { "epoch": 0.16761852336062755, "grad_norm": 0.3995156875131695, "learning_rate": 3.7554340988432606e-05, "loss": 0.8662, "step": 609 }, { "epoch": 0.16789375903117043, "grad_norm": 0.42250602723841496, "learning_rate": 3.754596173145765e-05, "loss": 0.8326, "step": 610 }, { "epoch": 0.16816899470171334, "grad_norm": 0.42173339037171914, "learning_rate": 3.7537569082726645e-05, "loss": 0.8757, "step": 611 }, { "epoch": 0.16844423037225625, "grad_norm": 0.36688653992050907, "learning_rate": 3.7529163048645175e-05, "loss": 0.8264, "step": 612 }, { "epoch": 0.16871946604279914, "grad_norm": 0.43238645464559056, "learning_rate": 3.752074363562907e-05, "loss": 0.8422, "step": 613 }, { "epoch": 0.16899470171334205, "grad_norm": 0.38387386608170954, "learning_rate": 3.751231085010433e-05, "loss": 0.8252, "step": 614 }, { "epoch": 0.16926993738388496, "grad_norm": 0.42617280811410624, "learning_rate": 3.750386469850719e-05, "loss": 0.8181, "step": 615 }, { "epoch": 0.16954517305442784, "grad_norm": 0.45116202185755655, "learning_rate": 3.749540518728409e-05, "loss": 0.8636, "step": 616 }, { "epoch": 0.16982040872497076, "grad_norm": 0.41685101305431854, "learning_rate": 3.7486932322891646e-05, "loss": 0.8295, "step": 617 }, { "epoch": 0.17009564439551367, "grad_norm": 0.4028147769083077, "learning_rate": 3.7478446111796676e-05, "loss": 0.829, "step": 618 }, { "epoch": 0.17037088006605655, "grad_norm": 0.39845488253498856, "learning_rate": 3.746994656047618e-05, "loss": 0.8497, "step": 619 }, { "epoch": 0.17064611573659946, "grad_norm": 0.45034021511122285, "learning_rate": 3.746143367541736e-05, "loss": 0.8846, "step": 620 }, { "epoch": 0.17092135140714237, "grad_norm": 0.4333175526376847, "learning_rate": 3.745290746311756e-05, "loss": 0.8352, "step": 621 }, { "epoch": 0.17119658707768526, "grad_norm": 0.43033481461769457, "learning_rate": 3.7444367930084324e-05, "loss": 0.8601, "step": 622 }, { "epoch": 0.17147182274822817, "grad_norm": 0.49429130468760024, "learning_rate": 3.7435815082835356e-05, "loss": 0.8546, "step": 623 }, { "epoch": 0.17174705841877108, "grad_norm": 0.44088834343857985, "learning_rate": 3.742724892789851e-05, "loss": 0.8461, "step": 624 }, { "epoch": 0.17202229408931397, "grad_norm": 0.41294866044719825, "learning_rate": 3.7418669471811815e-05, "loss": 0.8269, "step": 625 }, { "epoch": 0.17229752975985688, "grad_norm": 1.0199283661633092, "learning_rate": 3.741007672112345e-05, "loss": 0.8696, "step": 626 }, { "epoch": 0.1725727654303998, "grad_norm": 0.5058421556080241, "learning_rate": 3.740147068239171e-05, "loss": 0.8341, "step": 627 }, { "epoch": 0.17284800110094267, "grad_norm": 0.3778503710292955, "learning_rate": 3.739285136218508e-05, "loss": 0.8434, "step": 628 }, { "epoch": 0.17312323677148558, "grad_norm": 0.4399140389760773, "learning_rate": 3.738421876708215e-05, "loss": 0.83, "step": 629 }, { "epoch": 0.1733984724420285, "grad_norm": 0.5016172625218125, "learning_rate": 3.7375572903671654e-05, "loss": 0.8696, "step": 630 }, { "epoch": 0.17367370811257138, "grad_norm": 0.456216673669721, "learning_rate": 3.736691377855243e-05, "loss": 0.8685, "step": 631 }, { "epoch": 0.1739489437831143, "grad_norm": 0.4548718012725048, "learning_rate": 3.735824139833349e-05, "loss": 0.8373, "step": 632 }, { "epoch": 0.1742241794536572, "grad_norm": 0.49875651019797274, "learning_rate": 3.7349555769633905e-05, "loss": 0.8363, "step": 633 }, { "epoch": 0.17449941512420009, "grad_norm": 0.44963026053063526, "learning_rate": 3.7340856899082885e-05, "loss": 0.8564, "step": 634 }, { "epoch": 0.174774650794743, "grad_norm": 0.4499071962134856, "learning_rate": 3.733214479331976e-05, "loss": 0.8736, "step": 635 }, { "epoch": 0.1750498864652859, "grad_norm": 0.5002395296179322, "learning_rate": 3.732341945899392e-05, "loss": 0.8565, "step": 636 }, { "epoch": 0.1753251221358288, "grad_norm": 0.48806349949471, "learning_rate": 3.73146809027649e-05, "loss": 0.8958, "step": 637 }, { "epoch": 0.1756003578063717, "grad_norm": 1.3382369778664218, "learning_rate": 3.7305929131302295e-05, "loss": 0.862, "step": 638 }, { "epoch": 0.17587559347691462, "grad_norm": 0.5389643821953385, "learning_rate": 3.7297164151285784e-05, "loss": 0.867, "step": 639 }, { "epoch": 0.1761508291474575, "grad_norm": 2.9563400865736167, "learning_rate": 3.7288385969405165e-05, "loss": 0.8561, "step": 640 }, { "epoch": 0.1764260648180004, "grad_norm": 0.8251866766193862, "learning_rate": 3.7279594592360265e-05, "loss": 0.87, "step": 641 }, { "epoch": 0.17670130048854332, "grad_norm": 1.2182716955143615, "learning_rate": 3.7270790026861016e-05, "loss": 0.8344, "step": 642 }, { "epoch": 0.1769765361590862, "grad_norm": 0.7097183478086542, "learning_rate": 3.726197227962738e-05, "loss": 0.8327, "step": 643 }, { "epoch": 0.17725177182962912, "grad_norm": 0.8429420008347591, "learning_rate": 3.725314135738943e-05, "loss": 0.8435, "step": 644 }, { "epoch": 0.17752700750017203, "grad_norm": 0.8648986343209424, "learning_rate": 3.724429726688725e-05, "loss": 0.8657, "step": 645 }, { "epoch": 0.1778022431707149, "grad_norm": 0.754418945875783, "learning_rate": 3.7235440014870994e-05, "loss": 0.8107, "step": 646 }, { "epoch": 0.17807747884125782, "grad_norm": 0.7183465853403834, "learning_rate": 3.7226569608100866e-05, "loss": 0.8672, "step": 647 }, { "epoch": 0.17835271451180074, "grad_norm": 0.5483584662617327, "learning_rate": 3.72176860533471e-05, "loss": 0.839, "step": 648 }, { "epoch": 0.17862795018234362, "grad_norm": 0.6322640426864581, "learning_rate": 3.720878935738996e-05, "loss": 0.8734, "step": 649 }, { "epoch": 0.17890318585288653, "grad_norm": 0.7030573116202438, "learning_rate": 3.719987952701976e-05, "loss": 0.8489, "step": 650 }, { "epoch": 0.17917842152342944, "grad_norm": 0.6555737552414294, "learning_rate": 3.7190956569036825e-05, "loss": 0.8425, "step": 651 }, { "epoch": 0.17945365719397233, "grad_norm": 0.5482130233720278, "learning_rate": 3.718202049025149e-05, "loss": 0.8332, "step": 652 }, { "epoch": 0.17972889286451524, "grad_norm": 0.6035641346252707, "learning_rate": 3.717307129748413e-05, "loss": 0.8634, "step": 653 }, { "epoch": 0.18000412853505815, "grad_norm": 0.4963480276691412, "learning_rate": 3.71641089975651e-05, "loss": 0.8491, "step": 654 }, { "epoch": 0.18027936420560103, "grad_norm": 0.5258399669126971, "learning_rate": 3.715513359733479e-05, "loss": 0.8556, "step": 655 }, { "epoch": 0.18055459987614395, "grad_norm": 0.4454387243275687, "learning_rate": 3.7146145103643564e-05, "loss": 0.8608, "step": 656 }, { "epoch": 0.18082983554668686, "grad_norm": 0.6320853066798535, "learning_rate": 3.7137143523351787e-05, "loss": 0.8599, "step": 657 }, { "epoch": 0.18110507121722974, "grad_norm": 0.46435375045636657, "learning_rate": 3.712812886332982e-05, "loss": 0.8246, "step": 658 }, { "epoch": 0.18138030688777265, "grad_norm": 0.4475002613859921, "learning_rate": 3.7119101130457986e-05, "loss": 0.8496, "step": 659 }, { "epoch": 0.18165554255831556, "grad_norm": 0.4506156959521839, "learning_rate": 3.7110060331626605e-05, "loss": 0.8511, "step": 660 }, { "epoch": 0.18193077822885845, "grad_norm": 0.4521781865366933, "learning_rate": 3.710100647373597e-05, "loss": 0.8605, "step": 661 }, { "epoch": 0.18220601389940136, "grad_norm": 0.4469760583626412, "learning_rate": 3.7091939563696343e-05, "loss": 0.8713, "step": 662 }, { "epoch": 0.18248124956994427, "grad_norm": 0.4490346194814551, "learning_rate": 3.708285960842792e-05, "loss": 0.8406, "step": 663 }, { "epoch": 0.18275648524048715, "grad_norm": 0.45557395803295825, "learning_rate": 3.707376661486088e-05, "loss": 0.8568, "step": 664 }, { "epoch": 0.18303172091103007, "grad_norm": 0.5221170090389651, "learning_rate": 3.7064660589935356e-05, "loss": 0.8584, "step": 665 }, { "epoch": 0.18330695658157298, "grad_norm": 0.3958832022301931, "learning_rate": 3.7055541540601414e-05, "loss": 0.8346, "step": 666 }, { "epoch": 0.18358219225211586, "grad_norm": 0.4171414706934924, "learning_rate": 3.704640947381905e-05, "loss": 0.8545, "step": 667 }, { "epoch": 0.18385742792265877, "grad_norm": 0.5460493752890742, "learning_rate": 3.7037264396558234e-05, "loss": 0.8679, "step": 668 }, { "epoch": 0.18413266359320168, "grad_norm": 0.4301601757585685, "learning_rate": 3.7028106315798835e-05, "loss": 0.851, "step": 669 }, { "epoch": 0.18440789926374457, "grad_norm": 0.6001559118913008, "learning_rate": 3.7018935238530646e-05, "loss": 0.8401, "step": 670 }, { "epoch": 0.18468313493428748, "grad_norm": 0.4367149397988626, "learning_rate": 3.700975117175339e-05, "loss": 0.8432, "step": 671 }, { "epoch": 0.1849583706048304, "grad_norm": 0.39430011051376235, "learning_rate": 3.700055412247671e-05, "loss": 0.8551, "step": 672 }, { "epoch": 0.18523360627537327, "grad_norm": 0.3994623447630073, "learning_rate": 3.699134409772014e-05, "loss": 0.8403, "step": 673 }, { "epoch": 0.1855088419459162, "grad_norm": 0.4239377062368287, "learning_rate": 3.698212110451313e-05, "loss": 0.8532, "step": 674 }, { "epoch": 0.1857840776164591, "grad_norm": 0.42984171548912564, "learning_rate": 3.697288514989502e-05, "loss": 0.8563, "step": 675 }, { "epoch": 0.18605931328700198, "grad_norm": 0.4182100781010544, "learning_rate": 3.696363624091506e-05, "loss": 0.8265, "step": 676 }, { "epoch": 0.1863345489575449, "grad_norm": 0.42805379999521154, "learning_rate": 3.6954374384632364e-05, "loss": 0.8493, "step": 677 }, { "epoch": 0.1866097846280878, "grad_norm": 0.4592615741308747, "learning_rate": 3.6945099588115945e-05, "loss": 0.8312, "step": 678 }, { "epoch": 0.18688502029863072, "grad_norm": 0.41069079842461964, "learning_rate": 3.693581185844468e-05, "loss": 0.8698, "step": 679 }, { "epoch": 0.1871602559691736, "grad_norm": 0.39478483232062667, "learning_rate": 3.692651120270733e-05, "loss": 0.8495, "step": 680 }, { "epoch": 0.1874354916397165, "grad_norm": 0.4148331618032844, "learning_rate": 3.691719762800251e-05, "loss": 0.8364, "step": 681 }, { "epoch": 0.18771072731025942, "grad_norm": 0.39613157370235597, "learning_rate": 3.690787114143869e-05, "loss": 0.8362, "step": 682 }, { "epoch": 0.1879859629808023, "grad_norm": 0.38213320383152727, "learning_rate": 3.689853175013423e-05, "loss": 0.8596, "step": 683 }, { "epoch": 0.18826119865134522, "grad_norm": 0.4216344319714442, "learning_rate": 3.6889179461217295e-05, "loss": 0.8066, "step": 684 }, { "epoch": 0.18853643432188813, "grad_norm": 0.4519848330232041, "learning_rate": 3.6879814281825924e-05, "loss": 0.8343, "step": 685 }, { "epoch": 0.188811669992431, "grad_norm": 0.4840066654563424, "learning_rate": 3.687043621910798e-05, "loss": 0.889, "step": 686 }, { "epoch": 0.18908690566297393, "grad_norm": 0.4771316148312613, "learning_rate": 3.6861045280221153e-05, "loss": 0.8536, "step": 687 }, { "epoch": 0.18936214133351684, "grad_norm": 0.4100103260938992, "learning_rate": 3.6851641472332985e-05, "loss": 0.8478, "step": 688 }, { "epoch": 0.18963737700405972, "grad_norm": 0.39765616123941616, "learning_rate": 3.684222480262082e-05, "loss": 0.8423, "step": 689 }, { "epoch": 0.18991261267460263, "grad_norm": 0.42154783908146215, "learning_rate": 3.683279527827182e-05, "loss": 0.8498, "step": 690 }, { "epoch": 0.19018784834514554, "grad_norm": 0.4393589996269681, "learning_rate": 3.682335290648297e-05, "loss": 0.8658, "step": 691 }, { "epoch": 0.19046308401568843, "grad_norm": 0.45500979119470325, "learning_rate": 3.6813897694461045e-05, "loss": 0.836, "step": 692 }, { "epoch": 0.19073831968623134, "grad_norm": 0.47142718952846213, "learning_rate": 3.6804429649422636e-05, "loss": 0.8267, "step": 693 }, { "epoch": 0.19101355535677425, "grad_norm": 0.4512480273163847, "learning_rate": 3.679494877859412e-05, "loss": 0.8418, "step": 694 }, { "epoch": 0.19128879102731713, "grad_norm": 0.47736619973214345, "learning_rate": 3.678545508921166e-05, "loss": 0.8421, "step": 695 }, { "epoch": 0.19156402669786005, "grad_norm": 0.4147369289774733, "learning_rate": 3.67759485885212e-05, "loss": 0.8729, "step": 696 }, { "epoch": 0.19183926236840296, "grad_norm": 0.38610782159182894, "learning_rate": 3.676642928377849e-05, "loss": 0.8418, "step": 697 }, { "epoch": 0.19211449803894584, "grad_norm": 0.5074722657389754, "learning_rate": 3.675689718224901e-05, "loss": 0.8565, "step": 698 }, { "epoch": 0.19238973370948875, "grad_norm": 0.41315633992474105, "learning_rate": 3.674735229120804e-05, "loss": 0.8436, "step": 699 }, { "epoch": 0.19266496938003166, "grad_norm": 0.4569749943059334, "learning_rate": 3.6737794617940604e-05, "loss": 0.8704, "step": 700 }, { "epoch": 0.19294020505057455, "grad_norm": 0.4358495054803389, "learning_rate": 3.672822416974149e-05, "loss": 0.8384, "step": 701 }, { "epoch": 0.19321544072111746, "grad_norm": 0.3658382889556743, "learning_rate": 3.671864095391523e-05, "loss": 0.8641, "step": 702 }, { "epoch": 0.19349067639166037, "grad_norm": 0.4074752202677212, "learning_rate": 3.670904497777611e-05, "loss": 0.8373, "step": 703 }, { "epoch": 0.19376591206220325, "grad_norm": 0.43237299773290594, "learning_rate": 3.669943624864815e-05, "loss": 0.8224, "step": 704 }, { "epoch": 0.19404114773274617, "grad_norm": 0.4003017314125147, "learning_rate": 3.6689814773865103e-05, "loss": 0.8332, "step": 705 }, { "epoch": 0.19431638340328908, "grad_norm": 0.42792498361506986, "learning_rate": 3.6680180560770445e-05, "loss": 0.845, "step": 706 }, { "epoch": 0.19459161907383196, "grad_norm": 0.4460903177214098, "learning_rate": 3.667053361671738e-05, "loss": 0.8239, "step": 707 }, { "epoch": 0.19486685474437487, "grad_norm": 0.4235211722879677, "learning_rate": 3.6660873949068846e-05, "loss": 0.841, "step": 708 }, { "epoch": 0.19514209041491778, "grad_norm": 0.41710277993607175, "learning_rate": 3.665120156519745e-05, "loss": 0.8402, "step": 709 }, { "epoch": 0.19541732608546067, "grad_norm": 0.4357796505973876, "learning_rate": 3.6641516472485544e-05, "loss": 0.827, "step": 710 }, { "epoch": 0.19569256175600358, "grad_norm": 0.4222776705197388, "learning_rate": 3.663181867832515e-05, "loss": 0.8554, "step": 711 }, { "epoch": 0.1959677974265465, "grad_norm": 0.42764490948061673, "learning_rate": 3.662210819011802e-05, "loss": 0.8951, "step": 712 }, { "epoch": 0.19624303309708938, "grad_norm": 0.5324227250448954, "learning_rate": 3.661238501527556e-05, "loss": 0.8095, "step": 713 }, { "epoch": 0.1965182687676323, "grad_norm": 0.4212322505107573, "learning_rate": 3.660264916121888e-05, "loss": 0.8336, "step": 714 }, { "epoch": 0.1967935044381752, "grad_norm": 0.574041932056587, "learning_rate": 3.659290063537875e-05, "loss": 0.8405, "step": 715 }, { "epoch": 0.19706874010871808, "grad_norm": 0.5027051478358007, "learning_rate": 3.658313944519564e-05, "loss": 0.8664, "step": 716 }, { "epoch": 0.197343975779261, "grad_norm": 0.5262137971164996, "learning_rate": 3.657336559811965e-05, "loss": 0.8487, "step": 717 }, { "epoch": 0.1976192114498039, "grad_norm": 0.4758930292255706, "learning_rate": 3.6563579101610566e-05, "loss": 0.8327, "step": 718 }, { "epoch": 0.1978944471203468, "grad_norm": 0.5446657401501271, "learning_rate": 3.655377996313782e-05, "loss": 0.855, "step": 719 }, { "epoch": 0.1981696827908897, "grad_norm": 0.4796186891971015, "learning_rate": 3.654396819018048e-05, "loss": 0.8481, "step": 720 }, { "epoch": 0.1984449184614326, "grad_norm": 0.48475451769115785, "learning_rate": 3.653414379022729e-05, "loss": 0.8354, "step": 721 }, { "epoch": 0.1987201541319755, "grad_norm": 0.43354817713783983, "learning_rate": 3.6524306770776606e-05, "loss": 0.8626, "step": 722 }, { "epoch": 0.1989953898025184, "grad_norm": 0.40858146621787894, "learning_rate": 3.651445713933641e-05, "loss": 0.8201, "step": 723 }, { "epoch": 0.19927062547306132, "grad_norm": 0.46739570129985547, "learning_rate": 3.6504594903424335e-05, "loss": 0.8402, "step": 724 }, { "epoch": 0.1995458611436042, "grad_norm": 0.5960198370070712, "learning_rate": 3.649472007056762e-05, "loss": 0.8551, "step": 725 }, { "epoch": 0.19982109681414711, "grad_norm": 0.4249329905811914, "learning_rate": 3.648483264830311e-05, "loss": 0.8469, "step": 726 }, { "epoch": 0.20009633248469003, "grad_norm": 0.3884594476823848, "learning_rate": 3.647493264417727e-05, "loss": 0.846, "step": 727 }, { "epoch": 0.2003715681552329, "grad_norm": 0.43771188863603633, "learning_rate": 3.6465020065746174e-05, "loss": 0.8554, "step": 728 }, { "epoch": 0.20064680382577582, "grad_norm": 0.414388470066882, "learning_rate": 3.645509492057548e-05, "loss": 0.8526, "step": 729 }, { "epoch": 0.20092203949631873, "grad_norm": 0.4381310106126896, "learning_rate": 3.6445157216240434e-05, "loss": 0.8125, "step": 730 }, { "epoch": 0.20119727516686162, "grad_norm": 0.4489280588883335, "learning_rate": 3.6435206960325884e-05, "loss": 0.8379, "step": 731 }, { "epoch": 0.20147251083740453, "grad_norm": 0.4104386042629172, "learning_rate": 3.6425244160426257e-05, "loss": 0.8611, "step": 732 }, { "epoch": 0.20174774650794744, "grad_norm": 0.43649844835285395, "learning_rate": 3.641526882414553e-05, "loss": 0.8358, "step": 733 }, { "epoch": 0.20202298217849032, "grad_norm": 0.4785645068198297, "learning_rate": 3.640528095909728e-05, "loss": 0.8167, "step": 734 }, { "epoch": 0.20229821784903324, "grad_norm": 0.4801829819148601, "learning_rate": 3.6395280572904624e-05, "loss": 0.842, "step": 735 }, { "epoch": 0.20257345351957615, "grad_norm": 0.36946401326823053, "learning_rate": 3.6385267673200247e-05, "loss": 0.8602, "step": 736 }, { "epoch": 0.20284868919011903, "grad_norm": 0.4071178451879595, "learning_rate": 3.6375242267626374e-05, "loss": 0.8362, "step": 737 }, { "epoch": 0.20312392486066194, "grad_norm": 0.4626094932446769, "learning_rate": 3.636520436383479e-05, "loss": 0.85, "step": 738 }, { "epoch": 0.20339916053120485, "grad_norm": 0.4444176379580745, "learning_rate": 3.635515396948681e-05, "loss": 0.8418, "step": 739 }, { "epoch": 0.20367439620174774, "grad_norm": 0.39943612934790257, "learning_rate": 3.634509109225328e-05, "loss": 0.8176, "step": 740 }, { "epoch": 0.20394963187229065, "grad_norm": 0.42746362491863327, "learning_rate": 3.633501573981458e-05, "loss": 0.8221, "step": 741 }, { "epoch": 0.20422486754283356, "grad_norm": 0.43955769005816575, "learning_rate": 3.6324927919860605e-05, "loss": 0.8418, "step": 742 }, { "epoch": 0.20450010321337644, "grad_norm": 0.40136283800513434, "learning_rate": 3.631482764009077e-05, "loss": 0.8597, "step": 743 }, { "epoch": 0.20477533888391936, "grad_norm": 0.4342336125734908, "learning_rate": 3.6304714908214005e-05, "loss": 0.8522, "step": 744 }, { "epoch": 0.20505057455446227, "grad_norm": 0.39796792057967034, "learning_rate": 3.629458973194872e-05, "loss": 0.8314, "step": 745 }, { "epoch": 0.20532581022500515, "grad_norm": 0.36486618459201503, "learning_rate": 3.6284452119022864e-05, "loss": 0.8294, "step": 746 }, { "epoch": 0.20560104589554806, "grad_norm": 0.42153144887888017, "learning_rate": 3.627430207717384e-05, "loss": 0.8476, "step": 747 }, { "epoch": 0.20587628156609097, "grad_norm": 0.47187906267962265, "learning_rate": 3.626413961414856e-05, "loss": 0.8607, "step": 748 }, { "epoch": 0.20615151723663386, "grad_norm": 0.571093129302097, "learning_rate": 3.62539647377034e-05, "loss": 0.849, "step": 749 }, { "epoch": 0.20642675290717677, "grad_norm": 0.39155079928047015, "learning_rate": 3.624377745560423e-05, "loss": 0.8316, "step": 750 }, { "epoch": 0.20670198857771968, "grad_norm": 0.4626947473628133, "learning_rate": 3.6233577775626364e-05, "loss": 0.8106, "step": 751 }, { "epoch": 0.20697722424826256, "grad_norm": 0.3825681798940843, "learning_rate": 3.62233657055546e-05, "loss": 0.8505, "step": 752 }, { "epoch": 0.20725245991880548, "grad_norm": 0.8103835841494073, "learning_rate": 3.621314125318319e-05, "loss": 0.8371, "step": 753 }, { "epoch": 0.2075276955893484, "grad_norm": 0.544841934052842, "learning_rate": 3.620290442631581e-05, "loss": 0.8554, "step": 754 }, { "epoch": 0.20780293125989127, "grad_norm": 0.4640226605335836, "learning_rate": 3.619265523276563e-05, "loss": 0.8572, "step": 755 }, { "epoch": 0.20807816693043418, "grad_norm": 0.4142139456870526, "learning_rate": 3.6182393680355215e-05, "loss": 0.8407, "step": 756 }, { "epoch": 0.2083534026009771, "grad_norm": 0.41231709603275885, "learning_rate": 3.6172119776916574e-05, "loss": 0.8627, "step": 757 }, { "epoch": 0.20862863827151998, "grad_norm": 0.42718139122733884, "learning_rate": 3.616183353029116e-05, "loss": 0.8542, "step": 758 }, { "epoch": 0.2089038739420629, "grad_norm": 0.42746219507199107, "learning_rate": 3.615153494832982e-05, "loss": 0.8455, "step": 759 }, { "epoch": 0.2091791096126058, "grad_norm": 0.39937711944269977, "learning_rate": 3.6141224038892844e-05, "loss": 0.8575, "step": 760 }, { "epoch": 0.20945434528314869, "grad_norm": 0.40072257740751366, "learning_rate": 3.613090080984991e-05, "loss": 0.8381, "step": 761 }, { "epoch": 0.2097295809536916, "grad_norm": 0.4458910693132313, "learning_rate": 3.6120565269080106e-05, "loss": 0.8407, "step": 762 }, { "epoch": 0.2100048166242345, "grad_norm": 0.407923953462484, "learning_rate": 3.611021742447191e-05, "loss": 0.8258, "step": 763 }, { "epoch": 0.2102800522947774, "grad_norm": 0.4238305847658918, "learning_rate": 3.6099857283923207e-05, "loss": 0.813, "step": 764 }, { "epoch": 0.2105552879653203, "grad_norm": 0.47777018385121184, "learning_rate": 3.608948485534125e-05, "loss": 0.8392, "step": 765 }, { "epoch": 0.21083052363586322, "grad_norm": 0.3967502537158732, "learning_rate": 3.607910014664268e-05, "loss": 0.8059, "step": 766 }, { "epoch": 0.2111057593064061, "grad_norm": 0.4237758123471388, "learning_rate": 3.60687031657535e-05, "loss": 0.8245, "step": 767 }, { "epoch": 0.211380994976949, "grad_norm": 0.4774666222648608, "learning_rate": 3.60582939206091e-05, "loss": 0.8135, "step": 768 }, { "epoch": 0.21165623064749192, "grad_norm": 0.446412856116806, "learning_rate": 3.6047872419154214e-05, "loss": 0.8272, "step": 769 }, { "epoch": 0.2119314663180348, "grad_norm": 0.39678718268544694, "learning_rate": 3.603743866934293e-05, "loss": 0.8194, "step": 770 }, { "epoch": 0.21220670198857772, "grad_norm": 0.4710935686138563, "learning_rate": 3.60269926791387e-05, "loss": 0.8534, "step": 771 }, { "epoch": 0.21248193765912063, "grad_norm": 0.4720495079718069, "learning_rate": 3.60165344565143e-05, "loss": 0.8632, "step": 772 }, { "epoch": 0.2127571733296635, "grad_norm": 0.37049250958887253, "learning_rate": 3.600606400945184e-05, "loss": 0.8287, "step": 773 }, { "epoch": 0.21303240900020642, "grad_norm": 0.49193622376753743, "learning_rate": 3.5995581345942783e-05, "loss": 0.8417, "step": 774 }, { "epoch": 0.21330764467074934, "grad_norm": 0.5202575001687807, "learning_rate": 3.5985086473987905e-05, "loss": 0.8315, "step": 775 }, { "epoch": 0.21358288034129222, "grad_norm": 0.39311401189204315, "learning_rate": 3.597457940159728e-05, "loss": 0.8196, "step": 776 }, { "epoch": 0.21385811601183513, "grad_norm": 0.40245893709363184, "learning_rate": 3.596406013679034e-05, "loss": 0.8363, "step": 777 }, { "epoch": 0.21413335168237804, "grad_norm": 0.45702828332856205, "learning_rate": 3.595352868759577e-05, "loss": 0.8311, "step": 778 }, { "epoch": 0.21440858735292093, "grad_norm": 0.38752198413705274, "learning_rate": 3.5942985062051584e-05, "loss": 0.8729, "step": 779 }, { "epoch": 0.21468382302346384, "grad_norm": 0.4205086360306346, "learning_rate": 3.593242926820509e-05, "loss": 0.8396, "step": 780 }, { "epoch": 0.21495905869400675, "grad_norm": 0.41145441647441555, "learning_rate": 3.592186131411288e-05, "loss": 0.8424, "step": 781 }, { "epoch": 0.21523429436454963, "grad_norm": 0.4291218418547524, "learning_rate": 3.591128120784081e-05, "loss": 0.8341, "step": 782 }, { "epoch": 0.21550953003509254, "grad_norm": 0.4155079935445456, "learning_rate": 3.590068895746405e-05, "loss": 0.8526, "step": 783 }, { "epoch": 0.21578476570563546, "grad_norm": 1.1445884515085594, "learning_rate": 3.589008457106699e-05, "loss": 0.8226, "step": 784 }, { "epoch": 0.21606000137617834, "grad_norm": 0.3764159673685187, "learning_rate": 3.587946805674333e-05, "loss": 0.8361, "step": 785 }, { "epoch": 0.21633523704672125, "grad_norm": 0.4766390433135783, "learning_rate": 3.5868839422595984e-05, "loss": 0.8187, "step": 786 }, { "epoch": 0.21661047271726416, "grad_norm": 0.5686529336641953, "learning_rate": 3.5858198676737146e-05, "loss": 0.8436, "step": 787 }, { "epoch": 0.21688570838780705, "grad_norm": 0.39493700305912877, "learning_rate": 3.5847545827288245e-05, "loss": 0.857, "step": 788 }, { "epoch": 0.21716094405834996, "grad_norm": 0.4051005714111124, "learning_rate": 3.583688088237995e-05, "loss": 0.8429, "step": 789 }, { "epoch": 0.21743617972889287, "grad_norm": 0.41283129410754954, "learning_rate": 3.582620385015215e-05, "loss": 0.8374, "step": 790 }, { "epoch": 0.21771141539943575, "grad_norm": 0.39826594291908823, "learning_rate": 3.581551473875397e-05, "loss": 0.8274, "step": 791 }, { "epoch": 0.21798665106997867, "grad_norm": 0.4020013292191615, "learning_rate": 3.5804813556343764e-05, "loss": 0.8187, "step": 792 }, { "epoch": 0.21826188674052158, "grad_norm": 0.48337407202357424, "learning_rate": 3.579410031108908e-05, "loss": 0.8478, "step": 793 }, { "epoch": 0.21853712241106446, "grad_norm": 0.43589893225941345, "learning_rate": 3.578337501116668e-05, "loss": 0.8183, "step": 794 }, { "epoch": 0.21881235808160737, "grad_norm": 0.3969535610324806, "learning_rate": 3.577263766476253e-05, "loss": 0.8091, "step": 795 }, { "epoch": 0.21908759375215028, "grad_norm": 0.4299895238949179, "learning_rate": 3.576188828007178e-05, "loss": 0.8381, "step": 796 }, { "epoch": 0.21936282942269317, "grad_norm": 0.46392382864977694, "learning_rate": 3.575112686529879e-05, "loss": 0.8407, "step": 797 }, { "epoch": 0.21963806509323608, "grad_norm": 0.4419500179183648, "learning_rate": 3.5740353428657075e-05, "loss": 0.8226, "step": 798 }, { "epoch": 0.219913300763779, "grad_norm": 0.3947764659546133, "learning_rate": 3.572956797836934e-05, "loss": 0.8247, "step": 799 }, { "epoch": 0.22018853643432187, "grad_norm": 0.44553746290385293, "learning_rate": 3.571877052266747e-05, "loss": 0.8194, "step": 800 }, { "epoch": 0.22046377210486479, "grad_norm": 0.6022734575775616, "learning_rate": 3.5707961069792483e-05, "loss": 0.8232, "step": 801 }, { "epoch": 0.2207390077754077, "grad_norm": 0.40624940254379577, "learning_rate": 3.5697139627994585e-05, "loss": 0.8157, "step": 802 }, { "epoch": 0.2210142434459506, "grad_norm": 0.3995554611453746, "learning_rate": 3.568630620553311e-05, "loss": 0.7882, "step": 803 }, { "epoch": 0.2212894791164935, "grad_norm": 0.434957866603094, "learning_rate": 3.567546081067654e-05, "loss": 0.8205, "step": 804 }, { "epoch": 0.2215647147870364, "grad_norm": 0.4411153780823904, "learning_rate": 3.566460345170252e-05, "loss": 0.8203, "step": 805 }, { "epoch": 0.22183995045757932, "grad_norm": 0.3631520885030517, "learning_rate": 3.565373413689779e-05, "loss": 0.824, "step": 806 }, { "epoch": 0.2221151861281222, "grad_norm": 0.37775935748126854, "learning_rate": 3.5642852874558224e-05, "loss": 0.8335, "step": 807 }, { "epoch": 0.2223904217986651, "grad_norm": 0.41532774700032266, "learning_rate": 3.563195967298884e-05, "loss": 0.8428, "step": 808 }, { "epoch": 0.22266565746920802, "grad_norm": 0.43219370366574045, "learning_rate": 3.5621054540503736e-05, "loss": 0.844, "step": 809 }, { "epoch": 0.2229408931397509, "grad_norm": 0.4192748580747729, "learning_rate": 3.561013748542615e-05, "loss": 0.8239, "step": 810 }, { "epoch": 0.22321612881029382, "grad_norm": 0.4004945451578492, "learning_rate": 3.559920851608837e-05, "loss": 0.8288, "step": 811 }, { "epoch": 0.22349136448083673, "grad_norm": 0.37578485115688853, "learning_rate": 3.558826764083183e-05, "loss": 0.8174, "step": 812 }, { "epoch": 0.2237666001513796, "grad_norm": 0.3900818322817826, "learning_rate": 3.557731486800703e-05, "loss": 0.8362, "step": 813 }, { "epoch": 0.22404183582192252, "grad_norm": 0.4111267758159336, "learning_rate": 3.556635020597354e-05, "loss": 0.8534, "step": 814 }, { "epoch": 0.22431707149246544, "grad_norm": 0.3443605977234703, "learning_rate": 3.5555373663100015e-05, "loss": 0.8554, "step": 815 }, { "epoch": 0.22459230716300832, "grad_norm": 0.4087510461090667, "learning_rate": 3.554438524776418e-05, "loss": 0.847, "step": 816 }, { "epoch": 0.22486754283355123, "grad_norm": 0.3969115734804899, "learning_rate": 3.5533384968352816e-05, "loss": 0.812, "step": 817 }, { "epoch": 0.22514277850409414, "grad_norm": 0.3522044017362988, "learning_rate": 3.5522372833261764e-05, "loss": 0.8143, "step": 818 }, { "epoch": 0.22541801417463703, "grad_norm": 0.37087751427808874, "learning_rate": 3.55113488508959e-05, "loss": 0.8436, "step": 819 }, { "epoch": 0.22569324984517994, "grad_norm": 0.3922001870166564, "learning_rate": 3.550031302966918e-05, "loss": 0.8495, "step": 820 }, { "epoch": 0.22596848551572285, "grad_norm": 0.491203905706425, "learning_rate": 3.548926537800454e-05, "loss": 0.8486, "step": 821 }, { "epoch": 0.22624372118626573, "grad_norm": 0.39444069165635215, "learning_rate": 3.547820590433399e-05, "loss": 0.8125, "step": 822 }, { "epoch": 0.22651895685680865, "grad_norm": 0.3486731037800461, "learning_rate": 3.546713461709854e-05, "loss": 0.8501, "step": 823 }, { "epoch": 0.22679419252735156, "grad_norm": 0.38337616001967323, "learning_rate": 3.5456051524748234e-05, "loss": 0.8487, "step": 824 }, { "epoch": 0.22706942819789444, "grad_norm": 0.5097332013738162, "learning_rate": 3.5444956635742107e-05, "loss": 0.8557, "step": 825 }, { "epoch": 0.22734466386843735, "grad_norm": 0.3759475626959331, "learning_rate": 3.543384995854821e-05, "loss": 0.8445, "step": 826 }, { "epoch": 0.22761989953898026, "grad_norm": 0.3863457112218582, "learning_rate": 3.5422731501643595e-05, "loss": 0.8318, "step": 827 }, { "epoch": 0.22789513520952315, "grad_norm": 0.46392118866275983, "learning_rate": 3.541160127351429e-05, "loss": 0.8483, "step": 828 }, { "epoch": 0.22817037088006606, "grad_norm": 0.405655137806616, "learning_rate": 3.540045928265531e-05, "loss": 0.811, "step": 829 }, { "epoch": 0.22844560655060897, "grad_norm": 0.34745418798529587, "learning_rate": 3.538930553757067e-05, "loss": 0.8354, "step": 830 }, { "epoch": 0.22872084222115185, "grad_norm": 0.4540177345167058, "learning_rate": 3.5378140046773324e-05, "loss": 0.8434, "step": 831 }, { "epoch": 0.22899607789169477, "grad_norm": 0.3282915556725718, "learning_rate": 3.536696281878521e-05, "loss": 0.8289, "step": 832 }, { "epoch": 0.22927131356223768, "grad_norm": 0.401827968549487, "learning_rate": 3.535577386213723e-05, "loss": 0.8329, "step": 833 }, { "epoch": 0.22954654923278056, "grad_norm": 0.43062890667059867, "learning_rate": 3.534457318536921e-05, "loss": 0.813, "step": 834 }, { "epoch": 0.22982178490332347, "grad_norm": 0.38907463282522503, "learning_rate": 3.5333360797029957e-05, "loss": 0.8533, "step": 835 }, { "epoch": 0.23009702057386638, "grad_norm": 0.3466293668642444, "learning_rate": 3.5322136705677186e-05, "loss": 0.8378, "step": 836 }, { "epoch": 0.23037225624440927, "grad_norm": 0.32815307841346353, "learning_rate": 3.531090091987757e-05, "loss": 0.7919, "step": 837 }, { "epoch": 0.23064749191495218, "grad_norm": 0.39176158147379686, "learning_rate": 3.529965344820668e-05, "loss": 0.8272, "step": 838 }, { "epoch": 0.2309227275854951, "grad_norm": 0.396333118129171, "learning_rate": 3.528839429924904e-05, "loss": 0.8145, "step": 839 }, { "epoch": 0.23119796325603797, "grad_norm": 0.41623749051402525, "learning_rate": 3.527712348159805e-05, "loss": 0.8167, "step": 840 }, { "epoch": 0.2314731989265809, "grad_norm": 0.34546793308076407, "learning_rate": 3.526584100385603e-05, "loss": 0.8219, "step": 841 }, { "epoch": 0.2317484345971238, "grad_norm": 0.35318250038626503, "learning_rate": 3.5254546874634226e-05, "loss": 0.8246, "step": 842 }, { "epoch": 0.23202367026766668, "grad_norm": 0.3649066796385545, "learning_rate": 3.524324110255273e-05, "loss": 0.8496, "step": 843 }, { "epoch": 0.2322989059382096, "grad_norm": 0.368191191502741, "learning_rate": 3.5231923696240564e-05, "loss": 0.8199, "step": 844 }, { "epoch": 0.2325741416087525, "grad_norm": 0.40493101571231055, "learning_rate": 3.52205946643356e-05, "loss": 0.7873, "step": 845 }, { "epoch": 0.2328493772792954, "grad_norm": 0.3625293265094342, "learning_rate": 3.520925401548459e-05, "loss": 0.8151, "step": 846 }, { "epoch": 0.2331246129498383, "grad_norm": 0.3600061234109703, "learning_rate": 3.519790175834316e-05, "loss": 0.8514, "step": 847 }, { "epoch": 0.2333998486203812, "grad_norm": 0.39105417860306296, "learning_rate": 3.518653790157579e-05, "loss": 0.8128, "step": 848 }, { "epoch": 0.2336750842909241, "grad_norm": 0.4051778323219665, "learning_rate": 3.517516245385582e-05, "loss": 0.8445, "step": 849 }, { "epoch": 0.233950319961467, "grad_norm": 0.36006749274778344, "learning_rate": 3.5163775423865426e-05, "loss": 0.8328, "step": 850 }, { "epoch": 0.23422555563200992, "grad_norm": 0.4466703831778952, "learning_rate": 3.515237682029563e-05, "loss": 0.8141, "step": 851 }, { "epoch": 0.2345007913025528, "grad_norm": 0.436716336795539, "learning_rate": 3.514096665184628e-05, "loss": 0.807, "step": 852 }, { "epoch": 0.23477602697309571, "grad_norm": 0.38083760181145965, "learning_rate": 3.512954492722607e-05, "loss": 0.8126, "step": 853 }, { "epoch": 0.23505126264363863, "grad_norm": 0.36376687064587027, "learning_rate": 3.5118111655152495e-05, "loss": 0.8255, "step": 854 }, { "epoch": 0.2353264983141815, "grad_norm": 0.40457155368152997, "learning_rate": 3.5106666844351865e-05, "loss": 0.8569, "step": 855 }, { "epoch": 0.23560173398472442, "grad_norm": 0.36572034471340376, "learning_rate": 3.5095210503559315e-05, "loss": 0.848, "step": 856 }, { "epoch": 0.23587696965526733, "grad_norm": 0.36066304336468447, "learning_rate": 3.508374264151876e-05, "loss": 0.833, "step": 857 }, { "epoch": 0.23615220532581022, "grad_norm": 0.40143929438584175, "learning_rate": 3.507226326698291e-05, "loss": 0.8235, "step": 858 }, { "epoch": 0.23642744099635313, "grad_norm": 0.40118410655130093, "learning_rate": 3.506077238871328e-05, "loss": 0.8443, "step": 859 }, { "epoch": 0.23670267666689604, "grad_norm": 0.3873017074806495, "learning_rate": 3.504927001548014e-05, "loss": 0.8456, "step": 860 }, { "epoch": 0.23697791233743892, "grad_norm": 1.1975254182139037, "learning_rate": 3.503775615606255e-05, "loss": 0.8334, "step": 861 }, { "epoch": 0.23725314800798183, "grad_norm": 0.3576509096851949, "learning_rate": 3.502623081924833e-05, "loss": 0.8473, "step": 862 }, { "epoch": 0.23752838367852475, "grad_norm": 0.4197276766222959, "learning_rate": 3.501469401383407e-05, "loss": 0.8473, "step": 863 }, { "epoch": 0.23780361934906763, "grad_norm": 0.40580866166081847, "learning_rate": 3.50031457486251e-05, "loss": 0.8455, "step": 864 }, { "epoch": 0.23807885501961054, "grad_norm": 0.4000441961670813, "learning_rate": 3.499158603243551e-05, "loss": 0.8319, "step": 865 }, { "epoch": 0.23835409069015345, "grad_norm": 0.3720020981698597, "learning_rate": 3.498001487408811e-05, "loss": 0.8258, "step": 866 }, { "epoch": 0.23862932636069634, "grad_norm": 0.46471451586910617, "learning_rate": 3.4968432282414455e-05, "loss": 0.8333, "step": 867 }, { "epoch": 0.23890456203123925, "grad_norm": 0.4924794839189762, "learning_rate": 3.495683826625485e-05, "loss": 0.8488, "step": 868 }, { "epoch": 0.23917979770178216, "grad_norm": 0.4557583818564989, "learning_rate": 3.494523283445826e-05, "loss": 0.8138, "step": 869 }, { "epoch": 0.23945503337232504, "grad_norm": 0.43558174579410125, "learning_rate": 3.493361599588243e-05, "loss": 0.7978, "step": 870 }, { "epoch": 0.23973026904286796, "grad_norm": 0.4093112919080268, "learning_rate": 3.4921987759393755e-05, "loss": 0.8347, "step": 871 }, { "epoch": 0.24000550471341087, "grad_norm": 0.38281761969319905, "learning_rate": 3.491034813386738e-05, "loss": 0.825, "step": 872 }, { "epoch": 0.24028074038395375, "grad_norm": 0.3666315809393925, "learning_rate": 3.489869712818709e-05, "loss": 0.8247, "step": 873 }, { "epoch": 0.24055597605449666, "grad_norm": 0.3871490455459194, "learning_rate": 3.488703475124541e-05, "loss": 0.8278, "step": 874 }, { "epoch": 0.24083121172503957, "grad_norm": 0.4241073969704235, "learning_rate": 3.48753610119435e-05, "loss": 0.8058, "step": 875 }, { "epoch": 0.24110644739558246, "grad_norm": 0.3803506114056611, "learning_rate": 3.486367591919121e-05, "loss": 0.8532, "step": 876 }, { "epoch": 0.24138168306612537, "grad_norm": 0.39639190673605307, "learning_rate": 3.485197948190706e-05, "loss": 0.8368, "step": 877 }, { "epoch": 0.24165691873666828, "grad_norm": 0.38092093936016846, "learning_rate": 3.484027170901822e-05, "loss": 0.8508, "step": 878 }, { "epoch": 0.24193215440721116, "grad_norm": 0.4019989320183119, "learning_rate": 3.482855260946052e-05, "loss": 0.8264, "step": 879 }, { "epoch": 0.24220739007775408, "grad_norm": 0.4237030119016722, "learning_rate": 3.4816822192178415e-05, "loss": 0.8616, "step": 880 }, { "epoch": 0.242482625748297, "grad_norm": 0.35898612975232064, "learning_rate": 3.480508046612502e-05, "loss": 0.7892, "step": 881 }, { "epoch": 0.24275786141883987, "grad_norm": 0.3768587129155393, "learning_rate": 3.479332744026208e-05, "loss": 0.8565, "step": 882 }, { "epoch": 0.24303309708938278, "grad_norm": 0.40306428284909, "learning_rate": 3.478156312355996e-05, "loss": 0.8422, "step": 883 }, { "epoch": 0.2433083327599257, "grad_norm": 0.5132061342859415, "learning_rate": 3.476978752499763e-05, "loss": 0.8377, "step": 884 }, { "epoch": 0.24358356843046858, "grad_norm": 0.3989743114285761, "learning_rate": 3.4758000653562695e-05, "loss": 0.8273, "step": 885 }, { "epoch": 0.2438588041010115, "grad_norm": 0.413623547871739, "learning_rate": 3.4746202518251344e-05, "loss": 0.8266, "step": 886 }, { "epoch": 0.2441340397715544, "grad_norm": 0.4401850552238258, "learning_rate": 3.473439312806836e-05, "loss": 0.8127, "step": 887 }, { "epoch": 0.24440927544209728, "grad_norm": 0.4554287041333923, "learning_rate": 3.4722572492027136e-05, "loss": 0.8554, "step": 888 }, { "epoch": 0.2446845111126402, "grad_norm": 0.4118004994600118, "learning_rate": 3.4710740619149645e-05, "loss": 0.8113, "step": 889 }, { "epoch": 0.2449597467831831, "grad_norm": 0.34307074613438765, "learning_rate": 3.469889751846642e-05, "loss": 0.8459, "step": 890 }, { "epoch": 0.245234982453726, "grad_norm": 0.4452265504756861, "learning_rate": 3.468704319901657e-05, "loss": 0.8261, "step": 891 }, { "epoch": 0.2455102181242689, "grad_norm": 0.4696507396866337, "learning_rate": 3.467517766984778e-05, "loss": 0.8404, "step": 892 }, { "epoch": 0.24578545379481181, "grad_norm": 0.43928661680504655, "learning_rate": 3.466330094001628e-05, "loss": 0.8513, "step": 893 }, { "epoch": 0.2460606894653547, "grad_norm": 0.4342036451462862, "learning_rate": 3.4651413018586844e-05, "loss": 0.809, "step": 894 }, { "epoch": 0.2463359251358976, "grad_norm": 0.4039800086638075, "learning_rate": 3.4639513914632785e-05, "loss": 0.8079, "step": 895 }, { "epoch": 0.24661116080644052, "grad_norm": 0.3917827211289552, "learning_rate": 3.4627603637235966e-05, "loss": 0.8188, "step": 896 }, { "epoch": 0.2468863964769834, "grad_norm": 0.43830843883671355, "learning_rate": 3.461568219548678e-05, "loss": 0.8036, "step": 897 }, { "epoch": 0.24716163214752632, "grad_norm": 0.3466938340677024, "learning_rate": 3.460374959848412e-05, "loss": 0.8094, "step": 898 }, { "epoch": 0.24743686781806923, "grad_norm": 0.37533059414164205, "learning_rate": 3.459180585533542e-05, "loss": 0.8255, "step": 899 }, { "epoch": 0.2477121034886121, "grad_norm": 0.3563580048663711, "learning_rate": 3.457985097515659e-05, "loss": 0.8259, "step": 900 }, { "epoch": 0.24798733915915502, "grad_norm": 0.35500169036390944, "learning_rate": 3.456788496707206e-05, "loss": 0.8323, "step": 901 }, { "epoch": 0.24826257482969794, "grad_norm": 0.3379825574292942, "learning_rate": 3.455590784021476e-05, "loss": 0.8418, "step": 902 }, { "epoch": 0.24853781050024082, "grad_norm": 0.3182222923242929, "learning_rate": 3.454391960372608e-05, "loss": 0.8465, "step": 903 }, { "epoch": 0.24881304617078373, "grad_norm": 0.33768377703810865, "learning_rate": 3.453192026675591e-05, "loss": 0.846, "step": 904 }, { "epoch": 0.24908828184132664, "grad_norm": 0.3540391684873382, "learning_rate": 3.451990983846262e-05, "loss": 0.8757, "step": 905 }, { "epoch": 0.24936351751186953, "grad_norm": 0.33240189216415617, "learning_rate": 3.4507888328013024e-05, "loss": 0.8366, "step": 906 }, { "epoch": 0.24963875318241244, "grad_norm": 0.3925041314170152, "learning_rate": 3.44958557445824e-05, "loss": 0.8302, "step": 907 }, { "epoch": 0.24991398885295535, "grad_norm": 0.33449708063817496, "learning_rate": 3.4483812097354494e-05, "loss": 0.8069, "step": 908 }, { "epoch": 0.25018922452349823, "grad_norm": 0.3406515455017463, "learning_rate": 3.4471757395521465e-05, "loss": 0.8462, "step": 909 }, { "epoch": 0.25046446019404117, "grad_norm": 0.41220154811851184, "learning_rate": 3.445969164828394e-05, "loss": 0.8244, "step": 910 }, { "epoch": 0.25073969586458406, "grad_norm": 0.4371018285501689, "learning_rate": 3.444761486485095e-05, "loss": 0.7883, "step": 911 }, { "epoch": 0.25101493153512694, "grad_norm": 0.43146670222188904, "learning_rate": 3.443552705443998e-05, "loss": 0.8343, "step": 912 }, { "epoch": 0.2512901672056699, "grad_norm": 0.38249638482227416, "learning_rate": 3.442342822627691e-05, "loss": 0.8463, "step": 913 }, { "epoch": 0.25156540287621276, "grad_norm": 0.4243860027202627, "learning_rate": 3.4411318389596026e-05, "loss": 0.8307, "step": 914 }, { "epoch": 0.25184063854675565, "grad_norm": 0.4463803488699344, "learning_rate": 3.4399197553640026e-05, "loss": 0.7895, "step": 915 }, { "epoch": 0.2521158742172986, "grad_norm": 0.44495882971206646, "learning_rate": 3.4387065727660004e-05, "loss": 0.7994, "step": 916 }, { "epoch": 0.25239110988784147, "grad_norm": 0.39566251098871, "learning_rate": 3.437492292091543e-05, "loss": 0.8299, "step": 917 }, { "epoch": 0.25266634555838435, "grad_norm": 0.4045246757106012, "learning_rate": 3.436276914267418e-05, "loss": 0.8511, "step": 918 }, { "epoch": 0.2529415812289273, "grad_norm": 0.42371859497702136, "learning_rate": 3.4350604402212464e-05, "loss": 0.8251, "step": 919 }, { "epoch": 0.2532168168994702, "grad_norm": 0.39492548977263753, "learning_rate": 3.4338428708814903e-05, "loss": 0.8134, "step": 920 }, { "epoch": 0.25349205257001306, "grad_norm": 0.43958568686797717, "learning_rate": 3.432624207177444e-05, "loss": 0.828, "step": 921 }, { "epoch": 0.253767288240556, "grad_norm": 0.4053228077160484, "learning_rate": 3.43140445003924e-05, "loss": 0.8058, "step": 922 }, { "epoch": 0.2540425239110989, "grad_norm": 0.42733786027660337, "learning_rate": 3.430183600397844e-05, "loss": 0.8469, "step": 923 }, { "epoch": 0.25431775958164177, "grad_norm": 0.4254993160770477, "learning_rate": 3.4289616591850545e-05, "loss": 0.8152, "step": 924 }, { "epoch": 0.2545929952521847, "grad_norm": 0.4138441910244267, "learning_rate": 3.427738627333506e-05, "loss": 0.8423, "step": 925 }, { "epoch": 0.2548682309227276, "grad_norm": 0.3546967376757614, "learning_rate": 3.426514505776662e-05, "loss": 0.8135, "step": 926 }, { "epoch": 0.2551434665932705, "grad_norm": 0.39442007367215237, "learning_rate": 3.4252892954488194e-05, "loss": 0.8363, "step": 927 }, { "epoch": 0.2554187022638134, "grad_norm": 0.43492833078367915, "learning_rate": 3.424062997285108e-05, "loss": 0.8282, "step": 928 }, { "epoch": 0.2556939379343563, "grad_norm": 0.39500704849323737, "learning_rate": 3.422835612221484e-05, "loss": 0.8325, "step": 929 }, { "epoch": 0.2559691736048992, "grad_norm": 0.34174578426227653, "learning_rate": 3.421607141194736e-05, "loss": 0.8321, "step": 930 }, { "epoch": 0.2562444092754421, "grad_norm": 0.3850431403355228, "learning_rate": 3.42037758514248e-05, "loss": 0.8874, "step": 931 }, { "epoch": 0.256519644945985, "grad_norm": 0.41994000628992534, "learning_rate": 3.4191469450031615e-05, "loss": 0.837, "step": 932 }, { "epoch": 0.2567948806165279, "grad_norm": 0.42433709518035356, "learning_rate": 3.417915221716052e-05, "loss": 0.8297, "step": 933 }, { "epoch": 0.2570701162870708, "grad_norm": 0.3393954015254882, "learning_rate": 3.416682416221251e-05, "loss": 0.8312, "step": 934 }, { "epoch": 0.2573453519576137, "grad_norm": 0.3839116112214657, "learning_rate": 3.415448529459681e-05, "loss": 0.8374, "step": 935 }, { "epoch": 0.2576205876281566, "grad_norm": 0.44548769644823044, "learning_rate": 3.4142135623730954e-05, "loss": 0.8558, "step": 936 }, { "epoch": 0.25789582329869953, "grad_norm": 0.49983780476601025, "learning_rate": 3.412977515904067e-05, "loss": 0.8126, "step": 937 }, { "epoch": 0.2581710589692424, "grad_norm": 0.4240413807664675, "learning_rate": 3.411740390995994e-05, "loss": 0.8237, "step": 938 }, { "epoch": 0.2584462946397853, "grad_norm": 0.42031699262819905, "learning_rate": 3.410502188593099e-05, "loss": 0.8228, "step": 939 }, { "epoch": 0.25872153031032824, "grad_norm": 0.5173553123655487, "learning_rate": 3.409262909640425e-05, "loss": 0.8218, "step": 940 }, { "epoch": 0.2589967659808711, "grad_norm": 0.4245075055820437, "learning_rate": 3.4080225550838375e-05, "loss": 0.8268, "step": 941 }, { "epoch": 0.259272001651414, "grad_norm": 0.3727650142067033, "learning_rate": 3.4067811258700236e-05, "loss": 0.8258, "step": 942 }, { "epoch": 0.25954723732195695, "grad_norm": 0.5595207476315207, "learning_rate": 3.40553862294649e-05, "loss": 0.8124, "step": 943 }, { "epoch": 0.25982247299249983, "grad_norm": 0.49741572494471603, "learning_rate": 3.4042950472615635e-05, "loss": 0.8276, "step": 944 }, { "epoch": 0.2600977086630427, "grad_norm": 0.48165510019003055, "learning_rate": 3.4030503997643876e-05, "loss": 0.8461, "step": 945 }, { "epoch": 0.26037294433358565, "grad_norm": 0.48855567453007936, "learning_rate": 3.4018046814049265e-05, "loss": 0.8302, "step": 946 }, { "epoch": 0.26064818000412854, "grad_norm": 0.436296244261392, "learning_rate": 3.400557893133961e-05, "loss": 0.8171, "step": 947 }, { "epoch": 0.2609234156746714, "grad_norm": 0.3988161745438226, "learning_rate": 3.399310035903087e-05, "loss": 0.816, "step": 948 }, { "epoch": 0.26119865134521436, "grad_norm": 0.42125979614280584, "learning_rate": 3.398061110664717e-05, "loss": 0.807, "step": 949 }, { "epoch": 0.26147388701575724, "grad_norm": 0.4835863160629807, "learning_rate": 3.3968111183720804e-05, "loss": 0.8311, "step": 950 }, { "epoch": 0.26174912268630013, "grad_norm": 0.42889734579019506, "learning_rate": 3.3955600599792186e-05, "loss": 0.8391, "step": 951 }, { "epoch": 0.26202435835684307, "grad_norm": 0.39552458434404236, "learning_rate": 3.394307936440989e-05, "loss": 0.8301, "step": 952 }, { "epoch": 0.26229959402738595, "grad_norm": 0.41065215014959544, "learning_rate": 3.393054748713059e-05, "loss": 0.8238, "step": 953 }, { "epoch": 0.26257482969792884, "grad_norm": 0.5533985577656855, "learning_rate": 3.391800497751911e-05, "loss": 0.8051, "step": 954 }, { "epoch": 0.2628500653684718, "grad_norm": 0.47953893440947115, "learning_rate": 3.3905451845148375e-05, "loss": 0.8269, "step": 955 }, { "epoch": 0.26312530103901466, "grad_norm": 0.40075344854455874, "learning_rate": 3.3892888099599415e-05, "loss": 0.8513, "step": 956 }, { "epoch": 0.26340053670955754, "grad_norm": 0.4543731521458455, "learning_rate": 3.3880313750461376e-05, "loss": 0.7749, "step": 957 }, { "epoch": 0.2636757723801005, "grad_norm": 0.5196069215957392, "learning_rate": 3.386772880733149e-05, "loss": 0.8211, "step": 958 }, { "epoch": 0.26395100805064337, "grad_norm": 0.45811909805641676, "learning_rate": 3.3855133279815055e-05, "loss": 0.8347, "step": 959 }, { "epoch": 0.26422624372118625, "grad_norm": 0.372973209850614, "learning_rate": 3.3842527177525475e-05, "loss": 0.8385, "step": 960 }, { "epoch": 0.2645014793917292, "grad_norm": 0.44907430913049645, "learning_rate": 3.382991051008422e-05, "loss": 0.8073, "step": 961 }, { "epoch": 0.26477671506227207, "grad_norm": 0.42363534517689194, "learning_rate": 3.381728328712081e-05, "loss": 0.8453, "step": 962 }, { "epoch": 0.26505195073281496, "grad_norm": 0.376662745066105, "learning_rate": 3.3804645518272824e-05, "loss": 0.8403, "step": 963 }, { "epoch": 0.2653271864033579, "grad_norm": 0.49512431710521343, "learning_rate": 3.379199721318591e-05, "loss": 0.8253, "step": 964 }, { "epoch": 0.2656024220739008, "grad_norm": 0.4591187483451386, "learning_rate": 3.377933838151374e-05, "loss": 0.8265, "step": 965 }, { "epoch": 0.26587765774444366, "grad_norm": 0.4601168383862767, "learning_rate": 3.376666903291801e-05, "loss": 0.8297, "step": 966 }, { "epoch": 0.2661528934149866, "grad_norm": 0.3191724881281232, "learning_rate": 3.375398917706847e-05, "loss": 0.8404, "step": 967 }, { "epoch": 0.2664281290855295, "grad_norm": 0.4355995669187843, "learning_rate": 3.3741298823642874e-05, "loss": 0.8265, "step": 968 }, { "epoch": 0.26670336475607237, "grad_norm": 0.4395611618140551, "learning_rate": 3.3728597982326985e-05, "loss": 0.8013, "step": 969 }, { "epoch": 0.2669786004266153, "grad_norm": 0.4021718298563644, "learning_rate": 3.371588666281458e-05, "loss": 0.7941, "step": 970 }, { "epoch": 0.2672538360971582, "grad_norm": 0.37411754108701273, "learning_rate": 3.370316487480743e-05, "loss": 0.7882, "step": 971 }, { "epoch": 0.2675290717677011, "grad_norm": 0.532414476809795, "learning_rate": 3.369043262801529e-05, "loss": 0.8358, "step": 972 }, { "epoch": 0.267804307438244, "grad_norm": 0.45055415041612057, "learning_rate": 3.367768993215591e-05, "loss": 0.8406, "step": 973 }, { "epoch": 0.2680795431087869, "grad_norm": 0.38978013914461174, "learning_rate": 3.3664936796955006e-05, "loss": 0.8379, "step": 974 }, { "epoch": 0.2683547787793298, "grad_norm": 0.41528979439579355, "learning_rate": 3.365217323214626e-05, "loss": 0.829, "step": 975 }, { "epoch": 0.2686300144498727, "grad_norm": 0.46357769975153607, "learning_rate": 3.363939924747132e-05, "loss": 0.8224, "step": 976 }, { "epoch": 0.2689052501204156, "grad_norm": 0.3888950297127714, "learning_rate": 3.362661485267978e-05, "loss": 0.8354, "step": 977 }, { "epoch": 0.2691804857909585, "grad_norm": 0.35372551270193886, "learning_rate": 3.36138200575292e-05, "loss": 0.8172, "step": 978 }, { "epoch": 0.26945572146150143, "grad_norm": 0.3431499868102491, "learning_rate": 3.360101487178504e-05, "loss": 0.797, "step": 979 }, { "epoch": 0.2697309571320443, "grad_norm": 0.40984578702915175, "learning_rate": 3.3588199305220735e-05, "loss": 0.8154, "step": 980 }, { "epoch": 0.2700061928025872, "grad_norm": 0.34466839402337285, "learning_rate": 3.35753733676176e-05, "loss": 0.8384, "step": 981 }, { "epoch": 0.27028142847313014, "grad_norm": 0.3610475270773301, "learning_rate": 3.3562537068764896e-05, "loss": 0.8123, "step": 982 }, { "epoch": 0.270556664143673, "grad_norm": 0.37468376503602213, "learning_rate": 3.354969041845978e-05, "loss": 0.8073, "step": 983 }, { "epoch": 0.2708318998142159, "grad_norm": 0.41702853112269633, "learning_rate": 3.3536833426507324e-05, "loss": 0.8127, "step": 984 }, { "epoch": 0.27110713548475884, "grad_norm": 0.38086610307318625, "learning_rate": 3.3523966102720465e-05, "loss": 0.8237, "step": 985 }, { "epoch": 0.2713823711553017, "grad_norm": 0.3759654765913201, "learning_rate": 3.3511088456920043e-05, "loss": 0.8222, "step": 986 }, { "epoch": 0.2716576068258446, "grad_norm": 0.35873822918453746, "learning_rate": 3.349820049893478e-05, "loss": 0.7961, "step": 987 }, { "epoch": 0.27193284249638755, "grad_norm": 0.41700414745554093, "learning_rate": 3.348530223860127e-05, "loss": 0.7967, "step": 988 }, { "epoch": 0.27220807816693043, "grad_norm": 0.383267708528797, "learning_rate": 3.3472393685763955e-05, "loss": 0.8263, "step": 989 }, { "epoch": 0.2724833138374733, "grad_norm": 0.3613953730606685, "learning_rate": 3.345947485027514e-05, "loss": 0.8469, "step": 990 }, { "epoch": 0.27275854950801626, "grad_norm": 0.4303837046797646, "learning_rate": 3.344654574199499e-05, "loss": 0.8041, "step": 991 }, { "epoch": 0.27303378517855914, "grad_norm": 0.4413659571463197, "learning_rate": 3.343360637079148e-05, "loss": 0.8245, "step": 992 }, { "epoch": 0.273309020849102, "grad_norm": 0.36170362697171604, "learning_rate": 3.342065674654046e-05, "loss": 0.7983, "step": 993 }, { "epoch": 0.27358425651964496, "grad_norm": 0.40823498803765795, "learning_rate": 3.340769687912557e-05, "loss": 0.7897, "step": 994 }, { "epoch": 0.27385949219018785, "grad_norm": 0.45626660403090946, "learning_rate": 3.339472677843829e-05, "loss": 0.8149, "step": 995 }, { "epoch": 0.27413472786073073, "grad_norm": 0.4089685051647268, "learning_rate": 3.33817464543779e-05, "loss": 0.8165, "step": 996 }, { "epoch": 0.27440996353127367, "grad_norm": 0.3690058613659837, "learning_rate": 3.336875591685148e-05, "loss": 0.8301, "step": 997 }, { "epoch": 0.27468519920181655, "grad_norm": 0.4561618026234944, "learning_rate": 3.335575517577391e-05, "loss": 0.8238, "step": 998 }, { "epoch": 0.27496043487235944, "grad_norm": 0.41220364133671056, "learning_rate": 3.334274424106787e-05, "loss": 0.8332, "step": 999 }, { "epoch": 0.2752356705429024, "grad_norm": 0.3625798814772632, "learning_rate": 3.33297231226638e-05, "loss": 0.8407, "step": 1000 }, { "epoch": 0.27551090621344526, "grad_norm": 0.3687920746126404, "learning_rate": 3.331669183049991e-05, "loss": 0.8484, "step": 1001 }, { "epoch": 0.27578614188398815, "grad_norm": 0.4592938229425808, "learning_rate": 3.3303650374522205e-05, "loss": 0.8076, "step": 1002 }, { "epoch": 0.2760613775545311, "grad_norm": 0.4114855454534366, "learning_rate": 3.3290598764684415e-05, "loss": 0.7851, "step": 1003 }, { "epoch": 0.27633661322507397, "grad_norm": 0.3776835579461095, "learning_rate": 3.3277537010948046e-05, "loss": 0.8194, "step": 1004 }, { "epoch": 0.27661184889561685, "grad_norm": 0.32442630714287224, "learning_rate": 3.3264465123282316e-05, "loss": 0.8225, "step": 1005 }, { "epoch": 0.2768870845661598, "grad_norm": 0.4107413621161955, "learning_rate": 3.32513831116642e-05, "loss": 0.8312, "step": 1006 }, { "epoch": 0.2771623202367027, "grad_norm": 0.40158455919786795, "learning_rate": 3.32382909860784e-05, "loss": 0.8025, "step": 1007 }, { "epoch": 0.27743755590724556, "grad_norm": 0.3565030330111457, "learning_rate": 3.322518875651734e-05, "loss": 0.8227, "step": 1008 }, { "epoch": 0.2777127915777885, "grad_norm": 0.38154773925258695, "learning_rate": 3.321207643298113e-05, "loss": 0.8302, "step": 1009 }, { "epoch": 0.2779880272483314, "grad_norm": 0.3500103482441849, "learning_rate": 3.319895402547761e-05, "loss": 0.8266, "step": 1010 }, { "epoch": 0.27826326291887427, "grad_norm": 0.38055832839320536, "learning_rate": 3.318582154402232e-05, "loss": 0.7931, "step": 1011 }, { "epoch": 0.2785384985894172, "grad_norm": 0.4185997398789307, "learning_rate": 3.3172678998638456e-05, "loss": 0.8496, "step": 1012 }, { "epoch": 0.2788137342599601, "grad_norm": 0.35735306715304604, "learning_rate": 3.315952639935692e-05, "loss": 0.8089, "step": 1013 }, { "epoch": 0.279088969930503, "grad_norm": 0.3652762209734979, "learning_rate": 3.314636375621631e-05, "loss": 0.8347, "step": 1014 }, { "epoch": 0.2793642056010459, "grad_norm": 0.35985576189106766, "learning_rate": 3.3133191079262835e-05, "loss": 0.8479, "step": 1015 }, { "epoch": 0.2796394412715888, "grad_norm": 0.3571219737509123, "learning_rate": 3.31200083785504e-05, "loss": 0.8214, "step": 1016 }, { "epoch": 0.2799146769421317, "grad_norm": 0.3665332659456565, "learning_rate": 3.310681566414055e-05, "loss": 0.846, "step": 1017 }, { "epoch": 0.2801899126126746, "grad_norm": 0.5146021670283341, "learning_rate": 3.309361294610249e-05, "loss": 0.8226, "step": 1018 }, { "epoch": 0.2804651482832175, "grad_norm": 0.3832445426516723, "learning_rate": 3.3080400234513014e-05, "loss": 0.8247, "step": 1019 }, { "epoch": 0.2807403839537604, "grad_norm": 0.3671994816011794, "learning_rate": 3.30671775394566e-05, "loss": 0.8047, "step": 1020 }, { "epoch": 0.2810156196243033, "grad_norm": 0.46637880773878493, "learning_rate": 3.305394487102531e-05, "loss": 0.8291, "step": 1021 }, { "epoch": 0.2812908552948462, "grad_norm": 0.4159201543332825, "learning_rate": 3.304070223931883e-05, "loss": 0.8152, "step": 1022 }, { "epoch": 0.2815660909653891, "grad_norm": 0.3993491676615847, "learning_rate": 3.302744965444445e-05, "loss": 0.8258, "step": 1023 }, { "epoch": 0.28184132663593203, "grad_norm": 0.3379334616921281, "learning_rate": 3.3014187126517047e-05, "loss": 0.8262, "step": 1024 }, { "epoch": 0.2821165623064749, "grad_norm": 0.3983835614335685, "learning_rate": 3.3000914665659106e-05, "loss": 0.8327, "step": 1025 }, { "epoch": 0.2823917979770178, "grad_norm": 0.36513531070552024, "learning_rate": 3.298763228200067e-05, "loss": 0.8489, "step": 1026 }, { "epoch": 0.28266703364756074, "grad_norm": 0.37126347427748796, "learning_rate": 3.297433998567938e-05, "loss": 0.8117, "step": 1027 }, { "epoch": 0.2829422693181036, "grad_norm": 0.40646996012559816, "learning_rate": 3.296103778684043e-05, "loss": 0.8359, "step": 1028 }, { "epoch": 0.2832175049886465, "grad_norm": 0.38317328376311643, "learning_rate": 3.294772569563656e-05, "loss": 0.8011, "step": 1029 }, { "epoch": 0.28349274065918945, "grad_norm": 0.3488498488099232, "learning_rate": 3.293440372222808e-05, "loss": 0.8146, "step": 1030 }, { "epoch": 0.28376797632973233, "grad_norm": 0.36648213921116923, "learning_rate": 3.2921071876782824e-05, "loss": 0.8049, "step": 1031 }, { "epoch": 0.2840432120002752, "grad_norm": 0.4262654259718712, "learning_rate": 3.2907730169476194e-05, "loss": 0.7915, "step": 1032 }, { "epoch": 0.28431844767081815, "grad_norm": 0.3832364802699537, "learning_rate": 3.289437861049108e-05, "loss": 0.8358, "step": 1033 }, { "epoch": 0.28459368334136104, "grad_norm": 0.39458916831718355, "learning_rate": 3.288101721001791e-05, "loss": 0.7942, "step": 1034 }, { "epoch": 0.2848689190119039, "grad_norm": 0.4231162507259696, "learning_rate": 3.286764597825463e-05, "loss": 0.7979, "step": 1035 }, { "epoch": 0.28514415468244686, "grad_norm": 0.44727313611900127, "learning_rate": 3.2854264925406666e-05, "loss": 0.8358, "step": 1036 }, { "epoch": 0.28541939035298974, "grad_norm": 0.36572940078160615, "learning_rate": 3.2840874061686965e-05, "loss": 0.8144, "step": 1037 }, { "epoch": 0.2856946260235326, "grad_norm": 0.3857149157332862, "learning_rate": 3.2827473397315945e-05, "loss": 0.8096, "step": 1038 }, { "epoch": 0.28596986169407557, "grad_norm": 0.4044774233752771, "learning_rate": 3.2814062942521524e-05, "loss": 0.8095, "step": 1039 }, { "epoch": 0.28624509736461845, "grad_norm": 0.3990764616275733, "learning_rate": 3.280064270753906e-05, "loss": 0.7967, "step": 1040 }, { "epoch": 0.28652033303516133, "grad_norm": 0.35521758970608697, "learning_rate": 3.278721270261142e-05, "loss": 0.8042, "step": 1041 }, { "epoch": 0.2867955687057043, "grad_norm": 0.30655482545456053, "learning_rate": 3.2773772937988874e-05, "loss": 0.7957, "step": 1042 }, { "epoch": 0.28707080437624716, "grad_norm": 0.37026485436533674, "learning_rate": 3.27603234239292e-05, "loss": 0.8373, "step": 1043 }, { "epoch": 0.28734604004679004, "grad_norm": 0.36786957025880945, "learning_rate": 3.2746864170697554e-05, "loss": 0.812, "step": 1044 }, { "epoch": 0.287621275717333, "grad_norm": 0.3470180637142766, "learning_rate": 3.273339518856658e-05, "loss": 0.8179, "step": 1045 }, { "epoch": 0.28789651138787586, "grad_norm": 0.3756904712576622, "learning_rate": 3.271991648781632e-05, "loss": 0.7933, "step": 1046 }, { "epoch": 0.28817174705841875, "grad_norm": 0.354941020911814, "learning_rate": 3.2706428078734246e-05, "loss": 0.8325, "step": 1047 }, { "epoch": 0.2884469827289617, "grad_norm": 0.33715387407971126, "learning_rate": 3.269292997161522e-05, "loss": 0.8081, "step": 1048 }, { "epoch": 0.28872221839950457, "grad_norm": 0.31762205037148555, "learning_rate": 3.267942217676153e-05, "loss": 0.811, "step": 1049 }, { "epoch": 0.28899745407004745, "grad_norm": 0.29591205358051814, "learning_rate": 3.266590470448284e-05, "loss": 0.8288, "step": 1050 }, { "epoch": 0.2892726897405904, "grad_norm": 0.33213598524888627, "learning_rate": 3.265237756509621e-05, "loss": 0.812, "step": 1051 }, { "epoch": 0.2895479254111333, "grad_norm": 0.32687722628273364, "learning_rate": 3.263884076892608e-05, "loss": 0.7973, "step": 1052 }, { "epoch": 0.28982316108167616, "grad_norm": 0.33189357675975595, "learning_rate": 3.2625294326304255e-05, "loss": 0.8596, "step": 1053 }, { "epoch": 0.2900983967522191, "grad_norm": 0.3245325935760063, "learning_rate": 3.26117382475699e-05, "loss": 0.8156, "step": 1054 }, { "epoch": 0.290373632422762, "grad_norm": 0.3244394751137452, "learning_rate": 3.259817254306953e-05, "loss": 0.8366, "step": 1055 }, { "epoch": 0.29064886809330487, "grad_norm": 0.3563546261904209, "learning_rate": 3.258459722315702e-05, "loss": 0.7998, "step": 1056 }, { "epoch": 0.2909241037638478, "grad_norm": 0.32714568995843457, "learning_rate": 3.257101229819359e-05, "loss": 0.7998, "step": 1057 }, { "epoch": 0.2911993394343907, "grad_norm": 0.3236596736833071, "learning_rate": 3.255741777854778e-05, "loss": 0.8391, "step": 1058 }, { "epoch": 0.2914745751049336, "grad_norm": 0.31755531763025624, "learning_rate": 3.254381367459543e-05, "loss": 0.8079, "step": 1059 }, { "epoch": 0.2917498107754765, "grad_norm": 0.3357666732093731, "learning_rate": 3.2530199996719735e-05, "loss": 0.8483, "step": 1060 }, { "epoch": 0.2920250464460194, "grad_norm": 0.5232287946458585, "learning_rate": 3.251657675531118e-05, "loss": 0.8395, "step": 1061 }, { "epoch": 0.2923002821165623, "grad_norm": 0.33446335510949543, "learning_rate": 3.250294396076755e-05, "loss": 0.8198, "step": 1062 }, { "epoch": 0.2925755177871052, "grad_norm": 0.34175696038594394, "learning_rate": 3.248930162349391e-05, "loss": 0.8364, "step": 1063 }, { "epoch": 0.2928507534576481, "grad_norm": 0.3363356035598911, "learning_rate": 3.247564975390263e-05, "loss": 0.8256, "step": 1064 }, { "epoch": 0.293125989128191, "grad_norm": 0.34158877826390627, "learning_rate": 3.246198836241335e-05, "loss": 0.822, "step": 1065 }, { "epoch": 0.29340122479873393, "grad_norm": 0.3317539657236755, "learning_rate": 3.244831745945295e-05, "loss": 0.8239, "step": 1066 }, { "epoch": 0.2936764604692768, "grad_norm": 0.35703551560031244, "learning_rate": 3.2434637055455603e-05, "loss": 0.825, "step": 1067 }, { "epoch": 0.2939516961398197, "grad_norm": 0.3576830094256862, "learning_rate": 3.242094716086273e-05, "loss": 0.7974, "step": 1068 }, { "epoch": 0.29422693181036264, "grad_norm": 0.32835397231595426, "learning_rate": 3.240724778612298e-05, "loss": 0.8021, "step": 1069 }, { "epoch": 0.2945021674809055, "grad_norm": 0.34593052846467287, "learning_rate": 3.2393538941692245e-05, "loss": 0.827, "step": 1070 }, { "epoch": 0.2947774031514484, "grad_norm": 0.3470889744327635, "learning_rate": 3.237982063803365e-05, "loss": 0.8116, "step": 1071 }, { "epoch": 0.29505263882199134, "grad_norm": 0.32508847968784993, "learning_rate": 3.236609288561753e-05, "loss": 0.7863, "step": 1072 }, { "epoch": 0.2953278744925342, "grad_norm": 0.36213804888615797, "learning_rate": 3.235235569492143e-05, "loss": 0.8061, "step": 1073 }, { "epoch": 0.2956031101630771, "grad_norm": 0.3197503350817932, "learning_rate": 3.2338609076430114e-05, "loss": 0.8305, "step": 1074 }, { "epoch": 0.29587834583362005, "grad_norm": 0.36038210554453803, "learning_rate": 3.232485304063553e-05, "loss": 0.8802, "step": 1075 }, { "epoch": 0.29615358150416293, "grad_norm": 0.328607258126315, "learning_rate": 3.2311087598036825e-05, "loss": 0.8267, "step": 1076 }, { "epoch": 0.2964288171747058, "grad_norm": 0.3640745973298245, "learning_rate": 3.229731275914029e-05, "loss": 0.7978, "step": 1077 }, { "epoch": 0.29670405284524876, "grad_norm": 0.38813330516283573, "learning_rate": 3.2283528534459446e-05, "loss": 0.8318, "step": 1078 }, { "epoch": 0.29697928851579164, "grad_norm": 0.34363745354810477, "learning_rate": 3.2269734934514923e-05, "loss": 0.8253, "step": 1079 }, { "epoch": 0.2972545241863345, "grad_norm": 0.3504791780475102, "learning_rate": 3.2255931969834546e-05, "loss": 0.8036, "step": 1080 }, { "epoch": 0.29752975985687746, "grad_norm": 0.3658093579369949, "learning_rate": 3.224211965095326e-05, "loss": 0.8023, "step": 1081 }, { "epoch": 0.29780499552742035, "grad_norm": 0.38628879778930036, "learning_rate": 3.2228297988413164e-05, "loss": 0.8164, "step": 1082 }, { "epoch": 0.29808023119796323, "grad_norm": 0.32724059178950327, "learning_rate": 3.2214466992763483e-05, "loss": 0.7929, "step": 1083 }, { "epoch": 0.29835546686850617, "grad_norm": 0.38418792709404465, "learning_rate": 3.2200626674560575e-05, "loss": 0.807, "step": 1084 }, { "epoch": 0.29863070253904905, "grad_norm": 0.457456422932918, "learning_rate": 3.2186777044367896e-05, "loss": 0.8199, "step": 1085 }, { "epoch": 0.29890593820959194, "grad_norm": 0.3808994397435877, "learning_rate": 3.217291811275603e-05, "loss": 0.7976, "step": 1086 }, { "epoch": 0.2991811738801349, "grad_norm": 0.3149983883864467, "learning_rate": 3.215904989030263e-05, "loss": 0.8054, "step": 1087 }, { "epoch": 0.29945640955067776, "grad_norm": 0.39142336058549454, "learning_rate": 3.214517238759248e-05, "loss": 0.7858, "step": 1088 }, { "epoch": 0.29973164522122064, "grad_norm": 0.40642371193343135, "learning_rate": 3.213128561521742e-05, "loss": 0.8198, "step": 1089 }, { "epoch": 0.3000068808917636, "grad_norm": 0.38137415650228934, "learning_rate": 3.211738958377637e-05, "loss": 0.8409, "step": 1090 }, { "epoch": 0.30028211656230647, "grad_norm": 0.6588977669498615, "learning_rate": 3.210348430387531e-05, "loss": 0.7886, "step": 1091 }, { "epoch": 0.30055735223284935, "grad_norm": 0.3811754020394223, "learning_rate": 3.2089569786127294e-05, "loss": 0.7909, "step": 1092 }, { "epoch": 0.3008325879033923, "grad_norm": 0.38282563447111506, "learning_rate": 3.207564604115242e-05, "loss": 0.8053, "step": 1093 }, { "epoch": 0.3011078235739352, "grad_norm": 0.375145725571009, "learning_rate": 3.206171307957783e-05, "loss": 0.7704, "step": 1094 }, { "epoch": 0.30138305924447806, "grad_norm": 0.3985383893005983, "learning_rate": 3.2047770912037704e-05, "loss": 0.8437, "step": 1095 }, { "epoch": 0.301658294915021, "grad_norm": 0.3670117256233208, "learning_rate": 3.203381954917323e-05, "loss": 0.7936, "step": 1096 }, { "epoch": 0.3019335305855639, "grad_norm": 0.45355775728194997, "learning_rate": 3.2019859001632635e-05, "loss": 0.8196, "step": 1097 }, { "epoch": 0.30220876625610676, "grad_norm": 0.37933306920483373, "learning_rate": 3.2005889280071154e-05, "loss": 0.8009, "step": 1098 }, { "epoch": 0.3024840019266497, "grad_norm": 0.6413068450592516, "learning_rate": 3.1991910395151e-05, "loss": 0.8376, "step": 1099 }, { "epoch": 0.3027592375971926, "grad_norm": 0.3709682164236483, "learning_rate": 3.1977922357541414e-05, "loss": 0.8061, "step": 1100 }, { "epoch": 0.30303447326773547, "grad_norm": 0.35389360583624657, "learning_rate": 3.196392517791861e-05, "loss": 0.8107, "step": 1101 }, { "epoch": 0.3033097089382784, "grad_norm": 0.30775989343669724, "learning_rate": 3.194991886696575e-05, "loss": 0.8128, "step": 1102 }, { "epoch": 0.3035849446088213, "grad_norm": 0.3676167032966343, "learning_rate": 3.1935903435373026e-05, "loss": 0.8052, "step": 1103 }, { "epoch": 0.3038601802793642, "grad_norm": 0.34230786073322306, "learning_rate": 3.192187889383754e-05, "loss": 0.8067, "step": 1104 }, { "epoch": 0.3041354159499071, "grad_norm": 0.30832081555167196, "learning_rate": 3.190784525306336e-05, "loss": 0.8205, "step": 1105 }, { "epoch": 0.30441065162045, "grad_norm": 0.36304152683383306, "learning_rate": 3.189380252376151e-05, "loss": 0.8069, "step": 1106 }, { "epoch": 0.3046858872909929, "grad_norm": 0.3615993897222199, "learning_rate": 3.187975071664994e-05, "loss": 0.8019, "step": 1107 }, { "epoch": 0.3049611229615358, "grad_norm": 0.32974113357091156, "learning_rate": 3.186568984245354e-05, "loss": 0.8283, "step": 1108 }, { "epoch": 0.3052363586320787, "grad_norm": 0.3238031127924285, "learning_rate": 3.185161991190411e-05, "loss": 0.8033, "step": 1109 }, { "epoch": 0.30551159430262165, "grad_norm": 0.33615745699182764, "learning_rate": 3.183754093574035e-05, "loss": 0.8104, "step": 1110 }, { "epoch": 0.30578682997316453, "grad_norm": 0.36044438230543663, "learning_rate": 3.1823452924707894e-05, "loss": 0.8013, "step": 1111 }, { "epoch": 0.3060620656437074, "grad_norm": 0.5764325505023655, "learning_rate": 3.180935588955926e-05, "loss": 0.7694, "step": 1112 }, { "epoch": 0.30633730131425035, "grad_norm": 0.357333249303477, "learning_rate": 3.179524984105383e-05, "loss": 0.7981, "step": 1113 }, { "epoch": 0.30661253698479324, "grad_norm": 0.3668360038279917, "learning_rate": 3.178113478995791e-05, "loss": 0.8327, "step": 1114 }, { "epoch": 0.3068877726553361, "grad_norm": 0.39086386543844476, "learning_rate": 3.1767010747044635e-05, "loss": 0.8309, "step": 1115 }, { "epoch": 0.30716300832587906, "grad_norm": 0.32648495720119647, "learning_rate": 3.175287772309403e-05, "loss": 0.835, "step": 1116 }, { "epoch": 0.30743824399642194, "grad_norm": 0.8511553863015345, "learning_rate": 3.1738735728892956e-05, "loss": 0.8103, "step": 1117 }, { "epoch": 0.30771347966696483, "grad_norm": 0.39164869160745347, "learning_rate": 3.172458477523514e-05, "loss": 0.814, "step": 1118 }, { "epoch": 0.30798871533750777, "grad_norm": 0.33960980330211993, "learning_rate": 3.1710424872921126e-05, "loss": 0.7888, "step": 1119 }, { "epoch": 0.30826395100805065, "grad_norm": 0.36340957181666206, "learning_rate": 3.1696256032758304e-05, "loss": 0.8154, "step": 1120 }, { "epoch": 0.30853918667859354, "grad_norm": 0.3253130095574143, "learning_rate": 3.168207826556089e-05, "loss": 0.8096, "step": 1121 }, { "epoch": 0.3088144223491365, "grad_norm": 0.34444129036388177, "learning_rate": 3.1667891582149886e-05, "loss": 0.8281, "step": 1122 }, { "epoch": 0.30908965801967936, "grad_norm": 0.34975256000382554, "learning_rate": 3.165369599335312e-05, "loss": 0.8155, "step": 1123 }, { "epoch": 0.30936489369022224, "grad_norm": 0.3793158911943503, "learning_rate": 3.163949151000522e-05, "loss": 0.8448, "step": 1124 }, { "epoch": 0.3096401293607652, "grad_norm": 0.41887201437302796, "learning_rate": 3.162527814294761e-05, "loss": 0.8345, "step": 1125 }, { "epoch": 0.30991536503130807, "grad_norm": 0.3303807453135232, "learning_rate": 3.161105590302845e-05, "loss": 0.8057, "step": 1126 }, { "epoch": 0.31019060070185095, "grad_norm": 0.4215554810706794, "learning_rate": 3.159682480110273e-05, "loss": 0.8199, "step": 1127 }, { "epoch": 0.3104658363723939, "grad_norm": 0.3410676829644901, "learning_rate": 3.158258484803216e-05, "loss": 0.7984, "step": 1128 }, { "epoch": 0.3107410720429368, "grad_norm": 0.3154713819491548, "learning_rate": 3.156833605468523e-05, "loss": 0.7947, "step": 1129 }, { "epoch": 0.31101630771347966, "grad_norm": 0.3357186700692159, "learning_rate": 3.1554078431937184e-05, "loss": 0.7811, "step": 1130 }, { "epoch": 0.3112915433840226, "grad_norm": 0.2964859029925219, "learning_rate": 3.153981199066996e-05, "loss": 0.8289, "step": 1131 }, { "epoch": 0.3115667790545655, "grad_norm": 0.32597761114967777, "learning_rate": 3.152553674177227e-05, "loss": 0.8222, "step": 1132 }, { "epoch": 0.31184201472510836, "grad_norm": 0.3353452009647141, "learning_rate": 3.151125269613955e-05, "loss": 0.7971, "step": 1133 }, { "epoch": 0.3121172503956513, "grad_norm": 0.32858560039767065, "learning_rate": 3.1496959864673914e-05, "loss": 0.8003, "step": 1134 }, { "epoch": 0.3123924860661942, "grad_norm": 0.3157082619075387, "learning_rate": 3.148265825828422e-05, "loss": 0.8215, "step": 1135 }, { "epoch": 0.31266772173673707, "grad_norm": 0.3101601481876492, "learning_rate": 3.1468347887886004e-05, "loss": 0.8126, "step": 1136 }, { "epoch": 0.31294295740728, "grad_norm": 0.3933483214188846, "learning_rate": 3.145402876440148e-05, "loss": 0.7987, "step": 1137 }, { "epoch": 0.3132181930778229, "grad_norm": 0.31823140928731836, "learning_rate": 3.1439700898759565e-05, "loss": 0.8061, "step": 1138 }, { "epoch": 0.3134934287483658, "grad_norm": 0.305547483255719, "learning_rate": 3.142536430189585e-05, "loss": 0.7949, "step": 1139 }, { "epoch": 0.3137686644189087, "grad_norm": 0.34424414664507813, "learning_rate": 3.141101898475257e-05, "loss": 0.8018, "step": 1140 }, { "epoch": 0.3140439000894516, "grad_norm": 0.360493950749892, "learning_rate": 3.1396664958278614e-05, "loss": 0.8444, "step": 1141 }, { "epoch": 0.3143191357599945, "grad_norm": 0.3105624439412429, "learning_rate": 3.138230223342955e-05, "loss": 0.7923, "step": 1142 }, { "epoch": 0.3145943714305374, "grad_norm": 0.33707412890930577, "learning_rate": 3.136793082116756e-05, "loss": 0.8507, "step": 1143 }, { "epoch": 0.3148696071010803, "grad_norm": 0.3196651333111948, "learning_rate": 3.135355073246146e-05, "loss": 0.8353, "step": 1144 }, { "epoch": 0.3151448427716232, "grad_norm": 0.30958632526199326, "learning_rate": 3.133916197828668e-05, "loss": 0.8093, "step": 1145 }, { "epoch": 0.31542007844216613, "grad_norm": 0.3480763284379594, "learning_rate": 3.132476456962528e-05, "loss": 0.8423, "step": 1146 }, { "epoch": 0.315695314112709, "grad_norm": 0.33953480338161784, "learning_rate": 3.131035851746592e-05, "loss": 0.8248, "step": 1147 }, { "epoch": 0.3159705497832519, "grad_norm": 0.3453385914088088, "learning_rate": 3.129594383280386e-05, "loss": 0.7956, "step": 1148 }, { "epoch": 0.31624578545379484, "grad_norm": 0.3771287825833907, "learning_rate": 3.1281520526640936e-05, "loss": 0.8335, "step": 1149 }, { "epoch": 0.3165210211243377, "grad_norm": 0.3368884133886741, "learning_rate": 3.126708860998557e-05, "loss": 0.818, "step": 1150 }, { "epoch": 0.3167962567948806, "grad_norm": 0.3303278124290784, "learning_rate": 3.125264809385278e-05, "loss": 0.8042, "step": 1151 }, { "epoch": 0.31707149246542354, "grad_norm": 0.4145796934735968, "learning_rate": 3.1238198989264094e-05, "loss": 0.8208, "step": 1152 }, { "epoch": 0.3173467281359664, "grad_norm": 0.35755342074383434, "learning_rate": 3.122374130724765e-05, "loss": 0.8246, "step": 1153 }, { "epoch": 0.3176219638065093, "grad_norm": 0.34738534531203147, "learning_rate": 3.1209275058838105e-05, "loss": 0.8167, "step": 1154 }, { "epoch": 0.31789719947705225, "grad_norm": 0.3211440831630971, "learning_rate": 3.119480025507665e-05, "loss": 0.8181, "step": 1155 }, { "epoch": 0.31817243514759513, "grad_norm": 0.3756600592518461, "learning_rate": 3.1180316907011026e-05, "loss": 0.8246, "step": 1156 }, { "epoch": 0.318447670818138, "grad_norm": 0.3717852513165001, "learning_rate": 3.1165825025695484e-05, "loss": 0.8155, "step": 1157 }, { "epoch": 0.31872290648868096, "grad_norm": 0.3411660067212517, "learning_rate": 3.1151324622190776e-05, "loss": 0.8365, "step": 1158 }, { "epoch": 0.31899814215922384, "grad_norm": 0.35723796605780694, "learning_rate": 3.113681570756417e-05, "loss": 0.8077, "step": 1159 }, { "epoch": 0.3192733778297667, "grad_norm": 0.37898335129664723, "learning_rate": 3.112229829288946e-05, "loss": 0.8076, "step": 1160 }, { "epoch": 0.31954861350030966, "grad_norm": 0.3842451487822477, "learning_rate": 3.110777238924685e-05, "loss": 0.8018, "step": 1161 }, { "epoch": 0.31982384917085255, "grad_norm": 0.34774442510600123, "learning_rate": 3.109323800772312e-05, "loss": 0.8287, "step": 1162 }, { "epoch": 0.32009908484139543, "grad_norm": 0.321811757823895, "learning_rate": 3.1078695159411435e-05, "loss": 0.7819, "step": 1163 }, { "epoch": 0.32037432051193837, "grad_norm": 0.31062206597350395, "learning_rate": 3.106414385541147e-05, "loss": 0.7771, "step": 1164 }, { "epoch": 0.32064955618248125, "grad_norm": 0.3462157661062194, "learning_rate": 3.104958410682935e-05, "loss": 0.8109, "step": 1165 }, { "epoch": 0.32092479185302414, "grad_norm": 0.3225870300013751, "learning_rate": 3.1035015924777634e-05, "loss": 0.8416, "step": 1166 }, { "epoch": 0.3212000275235671, "grad_norm": 0.2908050643911587, "learning_rate": 3.102043932037532e-05, "loss": 0.8122, "step": 1167 }, { "epoch": 0.32147526319410996, "grad_norm": 0.32006337424346826, "learning_rate": 3.1005854304747826e-05, "loss": 0.852, "step": 1168 }, { "epoch": 0.32175049886465285, "grad_norm": 0.32418150253371214, "learning_rate": 3.0991260889027025e-05, "loss": 0.7922, "step": 1169 }, { "epoch": 0.3220257345351958, "grad_norm": 0.3270598384344112, "learning_rate": 3.097665908435115e-05, "loss": 0.7983, "step": 1170 }, { "epoch": 0.32230097020573867, "grad_norm": 0.3285607660817388, "learning_rate": 3.096204890186488e-05, "loss": 0.8012, "step": 1171 }, { "epoch": 0.32257620587628155, "grad_norm": 0.3270770971277959, "learning_rate": 3.0947430352719254e-05, "loss": 0.8058, "step": 1172 }, { "epoch": 0.3228514415468245, "grad_norm": 0.30266960130770704, "learning_rate": 3.0932803448071726e-05, "loss": 0.7792, "step": 1173 }, { "epoch": 0.3231266772173674, "grad_norm": 0.29525543430326306, "learning_rate": 3.091816819908611e-05, "loss": 0.8084, "step": 1174 }, { "epoch": 0.32340191288791026, "grad_norm": 0.2949338073321175, "learning_rate": 3.0903524616932604e-05, "loss": 0.8111, "step": 1175 }, { "epoch": 0.3236771485584532, "grad_norm": 2.409665762699941, "learning_rate": 3.0888872712787744e-05, "loss": 0.8098, "step": 1176 }, { "epoch": 0.3239523842289961, "grad_norm": 0.34556104275883914, "learning_rate": 3.0874212497834436e-05, "loss": 0.7965, "step": 1177 }, { "epoch": 0.32422761989953897, "grad_norm": 0.33400503870094994, "learning_rate": 3.0859543983261916e-05, "loss": 0.8097, "step": 1178 }, { "epoch": 0.3245028555700819, "grad_norm": 0.3159377875922141, "learning_rate": 3.0844867180265765e-05, "loss": 0.8028, "step": 1179 }, { "epoch": 0.3247780912406248, "grad_norm": 0.34527808475244703, "learning_rate": 3.083018210004789e-05, "loss": 0.7971, "step": 1180 }, { "epoch": 0.3250533269111677, "grad_norm": 0.3571175987053503, "learning_rate": 3.08154887538165e-05, "loss": 0.7921, "step": 1181 }, { "epoch": 0.3253285625817106, "grad_norm": 0.3430153890288714, "learning_rate": 3.080078715278614e-05, "loss": 0.7938, "step": 1182 }, { "epoch": 0.3256037982522535, "grad_norm": 0.317884097887881, "learning_rate": 3.078607730817763e-05, "loss": 0.7941, "step": 1183 }, { "epoch": 0.3258790339227964, "grad_norm": 0.33993810766485266, "learning_rate": 3.077135923121809e-05, "loss": 0.8235, "step": 1184 }, { "epoch": 0.3261542695933393, "grad_norm": 0.34211246205653845, "learning_rate": 3.075663293314093e-05, "loss": 0.86, "step": 1185 }, { "epoch": 0.3264295052638822, "grad_norm": 0.35413291917176803, "learning_rate": 3.074189842518584e-05, "loss": 0.7843, "step": 1186 }, { "epoch": 0.3267047409344251, "grad_norm": 0.3586716027990718, "learning_rate": 3.072715571859874e-05, "loss": 0.7954, "step": 1187 }, { "epoch": 0.326979976604968, "grad_norm": 0.32160005633334726, "learning_rate": 3.071240482463186e-05, "loss": 0.7991, "step": 1188 }, { "epoch": 0.3272552122755109, "grad_norm": 0.36521159572448836, "learning_rate": 3.0697645754543636e-05, "loss": 0.8058, "step": 1189 }, { "epoch": 0.3275304479460538, "grad_norm": 0.331018434067738, "learning_rate": 3.068287851959877e-05, "loss": 0.8261, "step": 1190 }, { "epoch": 0.32780568361659673, "grad_norm": 0.37362418936014585, "learning_rate": 3.066810313106818e-05, "loss": 0.8238, "step": 1191 }, { "epoch": 0.3280809192871396, "grad_norm": 0.40993410650019435, "learning_rate": 3.0653319600229e-05, "loss": 0.8012, "step": 1192 }, { "epoch": 0.3283561549576825, "grad_norm": 0.3378351823805291, "learning_rate": 3.063852793836462e-05, "loss": 0.8327, "step": 1193 }, { "epoch": 0.32863139062822544, "grad_norm": 0.3673208048610507, "learning_rate": 3.062372815676461e-05, "loss": 0.8315, "step": 1194 }, { "epoch": 0.3289066262987683, "grad_norm": 0.3884739790830478, "learning_rate": 3.06089202667247e-05, "loss": 0.7938, "step": 1195 }, { "epoch": 0.3291818619693112, "grad_norm": 0.34186931524257896, "learning_rate": 3.059410427954687e-05, "loss": 0.7876, "step": 1196 }, { "epoch": 0.32945709763985415, "grad_norm": 0.32303616322219825, "learning_rate": 3.057928020653925e-05, "loss": 0.8208, "step": 1197 }, { "epoch": 0.32973233331039703, "grad_norm": 0.35967963521533275, "learning_rate": 3.056444805901615e-05, "loss": 0.8186, "step": 1198 }, { "epoch": 0.3300075689809399, "grad_norm": 0.3594452352052698, "learning_rate": 3.0549607848298024e-05, "loss": 0.8048, "step": 1199 }, { "epoch": 0.33028280465148285, "grad_norm": 0.32815987567995447, "learning_rate": 3.0534759585711505e-05, "loss": 0.8301, "step": 1200 }, { "epoch": 0.33055804032202574, "grad_norm": 0.3323415741018949, "learning_rate": 3.0519903282589355e-05, "loss": 0.8312, "step": 1201 }, { "epoch": 0.3308332759925686, "grad_norm": 0.3596182289098799, "learning_rate": 3.0505038950270482e-05, "loss": 0.815, "step": 1202 }, { "epoch": 0.33110851166311156, "grad_norm": 0.3602247707048881, "learning_rate": 3.049016660009992e-05, "loss": 0.7734, "step": 1203 }, { "epoch": 0.33138374733365444, "grad_norm": 0.31273028302021144, "learning_rate": 3.0475286243428824e-05, "loss": 0.8322, "step": 1204 }, { "epoch": 0.3316589830041973, "grad_norm": 0.36781142044735826, "learning_rate": 3.0460397891614452e-05, "loss": 0.8127, "step": 1205 }, { "epoch": 0.33193421867474027, "grad_norm": 0.39408172254577667, "learning_rate": 3.044550155602017e-05, "loss": 0.8256, "step": 1206 }, { "epoch": 0.33220945434528315, "grad_norm": 0.3139660280815443, "learning_rate": 3.043059724801544e-05, "loss": 0.7946, "step": 1207 }, { "epoch": 0.33248469001582603, "grad_norm": 0.314800186230482, "learning_rate": 3.0415684978975802e-05, "loss": 0.8146, "step": 1208 }, { "epoch": 0.332759925686369, "grad_norm": 0.30046207121652113, "learning_rate": 3.0400764760282872e-05, "loss": 0.8208, "step": 1209 }, { "epoch": 0.33303516135691186, "grad_norm": 0.3533154546351518, "learning_rate": 3.0385836603324348e-05, "loss": 0.8022, "step": 1210 }, { "epoch": 0.33331039702745474, "grad_norm": 0.3108939167879381, "learning_rate": 3.037090051949397e-05, "loss": 0.7982, "step": 1211 }, { "epoch": 0.3335856326979977, "grad_norm": 0.307651020086293, "learning_rate": 3.0355956520191544e-05, "loss": 0.8243, "step": 1212 }, { "epoch": 0.33386086836854056, "grad_norm": 0.32257166738075094, "learning_rate": 3.0341004616822888e-05, "loss": 0.82, "step": 1213 }, { "epoch": 0.33413610403908345, "grad_norm": 0.3251066300297621, "learning_rate": 3.0326044820799887e-05, "loss": 0.8236, "step": 1214 }, { "epoch": 0.3344113397096264, "grad_norm": 0.37688942249238155, "learning_rate": 3.031107714354044e-05, "loss": 0.8055, "step": 1215 }, { "epoch": 0.33468657538016927, "grad_norm": 0.34475380933433564, "learning_rate": 3.0296101596468444e-05, "loss": 0.8088, "step": 1216 }, { "epoch": 0.33496181105071215, "grad_norm": 0.332618427949028, "learning_rate": 3.0281118191013817e-05, "loss": 0.7932, "step": 1217 }, { "epoch": 0.3352370467212551, "grad_norm": 0.35575743631868867, "learning_rate": 3.026612693861248e-05, "loss": 0.7902, "step": 1218 }, { "epoch": 0.335512282391798, "grad_norm": 0.31644648344381754, "learning_rate": 3.0251127850706332e-05, "loss": 0.8479, "step": 1219 }, { "epoch": 0.33578751806234086, "grad_norm": 0.3267765317990865, "learning_rate": 3.0236120938743256e-05, "loss": 0.8139, "step": 1220 }, { "epoch": 0.3360627537328838, "grad_norm": 0.3335729983083016, "learning_rate": 3.022110621417711e-05, "loss": 0.8171, "step": 1221 }, { "epoch": 0.3363379894034267, "grad_norm": 0.3071777559635056, "learning_rate": 3.0206083688467714e-05, "loss": 0.8428, "step": 1222 }, { "epoch": 0.33661322507396957, "grad_norm": 0.31898196862856226, "learning_rate": 3.0191053373080836e-05, "loss": 0.7964, "step": 1223 }, { "epoch": 0.3368884607445125, "grad_norm": 0.3322197682551663, "learning_rate": 3.0176015279488192e-05, "loss": 0.824, "step": 1224 }, { "epoch": 0.3371636964150554, "grad_norm": 0.3106140921777173, "learning_rate": 3.016096941916743e-05, "loss": 0.8096, "step": 1225 }, { "epoch": 0.3374389320855983, "grad_norm": 0.3217879380809505, "learning_rate": 3.014591580360215e-05, "loss": 0.7939, "step": 1226 }, { "epoch": 0.3377141677561412, "grad_norm": 0.34349030381089224, "learning_rate": 3.0130854444281836e-05, "loss": 0.8313, "step": 1227 }, { "epoch": 0.3379894034266841, "grad_norm": 0.31334713881153914, "learning_rate": 3.011578535270192e-05, "loss": 0.7933, "step": 1228 }, { "epoch": 0.338264639097227, "grad_norm": 0.322590390034193, "learning_rate": 3.0100708540363693e-05, "loss": 0.7951, "step": 1229 }, { "epoch": 0.3385398747677699, "grad_norm": 0.3220268508502108, "learning_rate": 3.0085624018774368e-05, "loss": 0.8019, "step": 1230 }, { "epoch": 0.3388151104383128, "grad_norm": 0.33786748372948533, "learning_rate": 3.0070531799447037e-05, "loss": 0.7967, "step": 1231 }, { "epoch": 0.3390903461088557, "grad_norm": 0.3232810238187581, "learning_rate": 3.0055431893900668e-05, "loss": 0.7889, "step": 1232 }, { "epoch": 0.33936558177939863, "grad_norm": 0.31511447249732616, "learning_rate": 3.0040324313660095e-05, "loss": 0.819, "step": 1233 }, { "epoch": 0.3396408174499415, "grad_norm": 0.32508515061520127, "learning_rate": 3.002520907025599e-05, "loss": 0.8422, "step": 1234 }, { "epoch": 0.3399160531204844, "grad_norm": 0.3367511206875014, "learning_rate": 3.0010086175224904e-05, "loss": 0.8127, "step": 1235 }, { "epoch": 0.34019128879102734, "grad_norm": 0.3197635030197052, "learning_rate": 2.9994955640109212e-05, "loss": 0.8557, "step": 1236 }, { "epoch": 0.3404665244615702, "grad_norm": 0.3329731739747747, "learning_rate": 2.9979817476457134e-05, "loss": 0.8161, "step": 1237 }, { "epoch": 0.3407417601321131, "grad_norm": 0.3805828506629627, "learning_rate": 2.996467169582268e-05, "loss": 0.8104, "step": 1238 }, { "epoch": 0.34101699580265604, "grad_norm": 0.29477204305043675, "learning_rate": 2.9949518309765716e-05, "loss": 0.8476, "step": 1239 }, { "epoch": 0.3412922314731989, "grad_norm": 0.3298880720437482, "learning_rate": 2.9934357329851873e-05, "loss": 0.8129, "step": 1240 }, { "epoch": 0.3415674671437418, "grad_norm": 0.2971338055346614, "learning_rate": 2.9919188767652615e-05, "loss": 0.8022, "step": 1241 }, { "epoch": 0.34184270281428475, "grad_norm": 0.38020295401976684, "learning_rate": 2.9904012634745155e-05, "loss": 0.8616, "step": 1242 }, { "epoch": 0.34211793848482763, "grad_norm": 0.33471548909158555, "learning_rate": 2.9888828942712526e-05, "loss": 0.796, "step": 1243 }, { "epoch": 0.3423931741553705, "grad_norm": 0.3140747511717884, "learning_rate": 2.9873637703143496e-05, "loss": 0.8197, "step": 1244 }, { "epoch": 0.34266840982591346, "grad_norm": 0.34789922438101994, "learning_rate": 2.9858438927632604e-05, "loss": 0.8057, "step": 1245 }, { "epoch": 0.34294364549645634, "grad_norm": 0.3500323011235929, "learning_rate": 2.9843232627780146e-05, "loss": 0.8288, "step": 1246 }, { "epoch": 0.3432188811669992, "grad_norm": 0.3518209494114577, "learning_rate": 2.9828018815192165e-05, "loss": 0.8365, "step": 1247 }, { "epoch": 0.34349411683754216, "grad_norm": 0.369605092665537, "learning_rate": 2.981279750148042e-05, "loss": 0.8176, "step": 1248 }, { "epoch": 0.34376935250808505, "grad_norm": 0.3435550533667174, "learning_rate": 2.9797568698262408e-05, "loss": 0.8077, "step": 1249 }, { "epoch": 0.34404458817862793, "grad_norm": 0.32694296796070815, "learning_rate": 2.9782332417161347e-05, "loss": 0.7941, "step": 1250 }, { "epoch": 0.34431982384917087, "grad_norm": 0.3236069373962924, "learning_rate": 2.9767088669806145e-05, "loss": 0.7937, "step": 1251 }, { "epoch": 0.34459505951971375, "grad_norm": 0.31080488888524316, "learning_rate": 2.9751837467831425e-05, "loss": 0.7979, "step": 1252 }, { "epoch": 0.34487029519025664, "grad_norm": 0.5999440469186542, "learning_rate": 2.9736578822877494e-05, "loss": 0.794, "step": 1253 }, { "epoch": 0.3451455308607996, "grad_norm": 0.35854535780548275, "learning_rate": 2.9721312746590346e-05, "loss": 0.7946, "step": 1254 }, { "epoch": 0.34542076653134246, "grad_norm": 0.36589362982369955, "learning_rate": 2.9706039250621626e-05, "loss": 0.7959, "step": 1255 }, { "epoch": 0.34569600220188534, "grad_norm": 0.2922692314970786, "learning_rate": 2.9690758346628663e-05, "loss": 0.8008, "step": 1256 }, { "epoch": 0.3459712378724283, "grad_norm": 0.3478361453792954, "learning_rate": 2.9675470046274432e-05, "loss": 0.8221, "step": 1257 }, { "epoch": 0.34624647354297117, "grad_norm": 0.3756605850220491, "learning_rate": 2.966017436122756e-05, "loss": 0.8077, "step": 1258 }, { "epoch": 0.34652170921351405, "grad_norm": 0.3383026121017178, "learning_rate": 2.9644871303162303e-05, "loss": 0.7974, "step": 1259 }, { "epoch": 0.346796944884057, "grad_norm": 0.3072858247518557, "learning_rate": 2.9629560883758547e-05, "loss": 0.7879, "step": 1260 }, { "epoch": 0.3470721805545999, "grad_norm": 0.34912142348807734, "learning_rate": 2.9614243114701793e-05, "loss": 0.8135, "step": 1261 }, { "epoch": 0.34734741622514276, "grad_norm": 0.37479003634737135, "learning_rate": 2.959891800768315e-05, "loss": 0.7965, "step": 1262 }, { "epoch": 0.3476226518956857, "grad_norm": 0.28982446000602097, "learning_rate": 2.9583585574399335e-05, "loss": 0.8059, "step": 1263 }, { "epoch": 0.3478978875662286, "grad_norm": 0.3469104066143551, "learning_rate": 2.9568245826552662e-05, "loss": 0.7957, "step": 1264 }, { "epoch": 0.34817312323677146, "grad_norm": 0.35034756184161747, "learning_rate": 2.9552898775851013e-05, "loss": 0.7733, "step": 1265 }, { "epoch": 0.3484483589073144, "grad_norm": 0.3216610530824517, "learning_rate": 2.9537544434007844e-05, "loss": 0.7871, "step": 1266 }, { "epoch": 0.3487235945778573, "grad_norm": 0.34038548926716644, "learning_rate": 2.9522182812742195e-05, "loss": 0.8159, "step": 1267 }, { "epoch": 0.34899883024840017, "grad_norm": 0.3129022395956265, "learning_rate": 2.9506813923778637e-05, "loss": 0.8493, "step": 1268 }, { "epoch": 0.3492740659189431, "grad_norm": 0.35733246998396123, "learning_rate": 2.9491437778847305e-05, "loss": 0.7921, "step": 1269 }, { "epoch": 0.349549301589486, "grad_norm": 0.4164933167458972, "learning_rate": 2.9476054389683865e-05, "loss": 0.8324, "step": 1270 }, { "epoch": 0.3498245372600289, "grad_norm": 0.31619142772383657, "learning_rate": 2.9460663768029523e-05, "loss": 0.7869, "step": 1271 }, { "epoch": 0.3500997729305718, "grad_norm": 0.3978250102523207, "learning_rate": 2.944526592563099e-05, "loss": 0.7979, "step": 1272 }, { "epoch": 0.3503750086011147, "grad_norm": 0.38105837046872004, "learning_rate": 2.9429860874240487e-05, "loss": 0.8504, "step": 1273 }, { "epoch": 0.3506502442716576, "grad_norm": 0.451797702235112, "learning_rate": 2.941444862561575e-05, "loss": 0.8174, "step": 1274 }, { "epoch": 0.3509254799422005, "grad_norm": 0.45694558929714885, "learning_rate": 2.939902919152001e-05, "loss": 0.8196, "step": 1275 }, { "epoch": 0.3512007156127434, "grad_norm": 0.38279750572095345, "learning_rate": 2.938360258372197e-05, "loss": 0.8099, "step": 1276 }, { "epoch": 0.3514759512832863, "grad_norm": 0.3679145053121925, "learning_rate": 2.9368168813995806e-05, "loss": 0.8013, "step": 1277 }, { "epoch": 0.35175118695382923, "grad_norm": 0.4157338308427359, "learning_rate": 2.9352727894121177e-05, "loss": 0.8227, "step": 1278 }, { "epoch": 0.3520264226243721, "grad_norm": 0.38644287527604376, "learning_rate": 2.9337279835883182e-05, "loss": 0.8048, "step": 1279 }, { "epoch": 0.352301658294915, "grad_norm": 0.38957270430092533, "learning_rate": 2.9321824651072387e-05, "loss": 0.7748, "step": 1280 }, { "epoch": 0.35257689396545794, "grad_norm": 0.3643444602815919, "learning_rate": 2.9306362351484775e-05, "loss": 0.8333, "step": 1281 }, { "epoch": 0.3528521296360008, "grad_norm": 0.3937275576769181, "learning_rate": 2.9290892948921784e-05, "loss": 0.7821, "step": 1282 }, { "epoch": 0.3531273653065437, "grad_norm": 0.4223826002667787, "learning_rate": 2.927541645519024e-05, "loss": 0.7973, "step": 1283 }, { "epoch": 0.35340260097708664, "grad_norm": 0.3636893447079271, "learning_rate": 2.9259932882102417e-05, "loss": 0.8181, "step": 1284 }, { "epoch": 0.35367783664762953, "grad_norm": 0.41412924101092213, "learning_rate": 2.924444224147597e-05, "loss": 0.8136, "step": 1285 }, { "epoch": 0.3539530723181724, "grad_norm": 0.36713321160115875, "learning_rate": 2.9228944545133963e-05, "loss": 0.8078, "step": 1286 }, { "epoch": 0.35422830798871535, "grad_norm": 0.3414686821209659, "learning_rate": 2.9213439804904826e-05, "loss": 0.8066, "step": 1287 }, { "epoch": 0.35450354365925824, "grad_norm": 0.3906568751906246, "learning_rate": 2.9197928032622377e-05, "loss": 0.7955, "step": 1288 }, { "epoch": 0.3547787793298011, "grad_norm": 0.38845030866559555, "learning_rate": 2.91824092401258e-05, "loss": 0.8167, "step": 1289 }, { "epoch": 0.35505401500034406, "grad_norm": 0.3802361599602513, "learning_rate": 2.916688343925965e-05, "loss": 0.8086, "step": 1290 }, { "epoch": 0.35532925067088694, "grad_norm": 0.3688825519695599, "learning_rate": 2.91513506418738e-05, "loss": 0.8437, "step": 1291 }, { "epoch": 0.3556044863414298, "grad_norm": 0.377317143026439, "learning_rate": 2.913581085982349e-05, "loss": 0.8203, "step": 1292 }, { "epoch": 0.35587972201197277, "grad_norm": 0.36144713130927403, "learning_rate": 2.912026410496929e-05, "loss": 0.7908, "step": 1293 }, { "epoch": 0.35615495768251565, "grad_norm": 0.3166623645159551, "learning_rate": 2.910471038917707e-05, "loss": 0.817, "step": 1294 }, { "epoch": 0.35643019335305853, "grad_norm": 0.3659223836396834, "learning_rate": 2.9089149724318026e-05, "loss": 0.8106, "step": 1295 }, { "epoch": 0.3567054290236015, "grad_norm": 0.34589936320896875, "learning_rate": 2.9073582122268677e-05, "loss": 0.8201, "step": 1296 }, { "epoch": 0.35698066469414436, "grad_norm": 0.339484705019932, "learning_rate": 2.9058007594910803e-05, "loss": 0.8258, "step": 1297 }, { "epoch": 0.35725590036468724, "grad_norm": 0.3230526376260753, "learning_rate": 2.904242615413149e-05, "loss": 0.8288, "step": 1298 }, { "epoch": 0.3575311360352302, "grad_norm": 0.3177306538128517, "learning_rate": 2.902683781182309e-05, "loss": 0.823, "step": 1299 }, { "epoch": 0.35780637170577306, "grad_norm": 0.3379774983757399, "learning_rate": 2.9011242579883237e-05, "loss": 0.8071, "step": 1300 }, { "epoch": 0.35808160737631595, "grad_norm": 0.28028830011157246, "learning_rate": 2.899564047021481e-05, "loss": 0.7855, "step": 1301 }, { "epoch": 0.3583568430468589, "grad_norm": 0.3160256126255414, "learning_rate": 2.898003149472594e-05, "loss": 0.8253, "step": 1302 }, { "epoch": 0.35863207871740177, "grad_norm": 0.2812495085931235, "learning_rate": 2.8964415665330005e-05, "loss": 0.783, "step": 1303 }, { "epoch": 0.35890731438794465, "grad_norm": 0.3304258342718014, "learning_rate": 2.8948792993945612e-05, "loss": 0.8093, "step": 1304 }, { "epoch": 0.3591825500584876, "grad_norm": 0.3036447483986763, "learning_rate": 2.893316349249658e-05, "loss": 0.8194, "step": 1305 }, { "epoch": 0.3594577857290305, "grad_norm": 0.3009727919107401, "learning_rate": 2.891752717291195e-05, "loss": 0.7908, "step": 1306 }, { "epoch": 0.35973302139957336, "grad_norm": 0.36570132605373035, "learning_rate": 2.8901884047125974e-05, "loss": 0.8066, "step": 1307 }, { "epoch": 0.3600082570701163, "grad_norm": 0.29539814063237785, "learning_rate": 2.8886234127078077e-05, "loss": 0.7843, "step": 1308 }, { "epoch": 0.3602834927406592, "grad_norm": 0.3168307419027681, "learning_rate": 2.8870577424712885e-05, "loss": 0.8095, "step": 1309 }, { "epoch": 0.36055872841120207, "grad_norm": 0.27458946917789645, "learning_rate": 2.8854913951980214e-05, "loss": 0.7595, "step": 1310 }, { "epoch": 0.360833964081745, "grad_norm": 0.3425052587485172, "learning_rate": 2.8839243720835007e-05, "loss": 0.8023, "step": 1311 }, { "epoch": 0.3611091997522879, "grad_norm": 0.47391896360520974, "learning_rate": 2.8823566743237408e-05, "loss": 0.8249, "step": 1312 }, { "epoch": 0.3613844354228308, "grad_norm": 0.31557328858822054, "learning_rate": 2.880788303115269e-05, "loss": 0.8175, "step": 1313 }, { "epoch": 0.3616596710933737, "grad_norm": 0.3369896790299706, "learning_rate": 2.879219259655126e-05, "loss": 0.8222, "step": 1314 }, { "epoch": 0.3619349067639166, "grad_norm": 0.3386795554756731, "learning_rate": 2.8776495451408677e-05, "loss": 0.8229, "step": 1315 }, { "epoch": 0.3622101424344595, "grad_norm": 0.3123954344701337, "learning_rate": 2.8760791607705597e-05, "loss": 0.8012, "step": 1316 }, { "epoch": 0.3624853781050024, "grad_norm": 0.35211511951310254, "learning_rate": 2.87450810774278e-05, "loss": 0.8248, "step": 1317 }, { "epoch": 0.3627606137755453, "grad_norm": 0.31329166669910147, "learning_rate": 2.8729363872566178e-05, "loss": 0.8139, "step": 1318 }, { "epoch": 0.3630358494460882, "grad_norm": 0.34213219153214414, "learning_rate": 2.8713640005116708e-05, "loss": 0.8237, "step": 1319 }, { "epoch": 0.3633110851166311, "grad_norm": 0.3264988397516051, "learning_rate": 2.8697909487080445e-05, "loss": 0.8155, "step": 1320 }, { "epoch": 0.363586320787174, "grad_norm": 0.31116444914451613, "learning_rate": 2.8682172330463536e-05, "loss": 0.8031, "step": 1321 }, { "epoch": 0.3638615564577169, "grad_norm": 0.31559275872970716, "learning_rate": 2.8666428547277186e-05, "loss": 0.8193, "step": 1322 }, { "epoch": 0.36413679212825983, "grad_norm": 0.5090974545519873, "learning_rate": 2.865067814953766e-05, "loss": 0.8016, "step": 1323 }, { "epoch": 0.3644120277988027, "grad_norm": 0.31499198831549413, "learning_rate": 2.863492114926626e-05, "loss": 0.7769, "step": 1324 }, { "epoch": 0.3646872634693456, "grad_norm": 0.3064263553389884, "learning_rate": 2.8619157558489355e-05, "loss": 0.8053, "step": 1325 }, { "epoch": 0.36496249913988854, "grad_norm": 0.3343110758386495, "learning_rate": 2.8603387389238313e-05, "loss": 0.8171, "step": 1326 }, { "epoch": 0.3652377348104314, "grad_norm": 0.3193469070200966, "learning_rate": 2.8587610653549536e-05, "loss": 0.7842, "step": 1327 }, { "epoch": 0.3655129704809743, "grad_norm": 0.3160529255063763, "learning_rate": 2.8571827363464454e-05, "loss": 0.7788, "step": 1328 }, { "epoch": 0.36578820615151725, "grad_norm": 0.28750606319238275, "learning_rate": 2.8556037531029468e-05, "loss": 0.8211, "step": 1329 }, { "epoch": 0.36606344182206013, "grad_norm": 0.3515067314610286, "learning_rate": 2.854024116829599e-05, "loss": 0.7957, "step": 1330 }, { "epoch": 0.366338677492603, "grad_norm": 0.3377351526025328, "learning_rate": 2.852443828732042e-05, "loss": 0.8351, "step": 1331 }, { "epoch": 0.36661391316314595, "grad_norm": 0.30706008799821316, "learning_rate": 2.8508628900164122e-05, "loss": 0.8064, "step": 1332 }, { "epoch": 0.36688914883368884, "grad_norm": 0.3132163933785639, "learning_rate": 2.849281301889344e-05, "loss": 0.7672, "step": 1333 }, { "epoch": 0.3671643845042317, "grad_norm": 0.30601176741209374, "learning_rate": 2.847699065557966e-05, "loss": 0.7908, "step": 1334 }, { "epoch": 0.36743962017477466, "grad_norm": 0.3184427099143359, "learning_rate": 2.846116182229902e-05, "loss": 0.8145, "step": 1335 }, { "epoch": 0.36771485584531755, "grad_norm": 0.30693411282540556, "learning_rate": 2.84453265311327e-05, "loss": 0.8238, "step": 1336 }, { "epoch": 0.36799009151586043, "grad_norm": 0.2985348513413893, "learning_rate": 2.8429484794166798e-05, "loss": 0.7928, "step": 1337 }, { "epoch": 0.36826532718640337, "grad_norm": 0.31666817103569384, "learning_rate": 2.841363662349235e-05, "loss": 0.7872, "step": 1338 }, { "epoch": 0.36854056285694625, "grad_norm": 0.3322566974582257, "learning_rate": 2.8397782031205295e-05, "loss": 0.8004, "step": 1339 }, { "epoch": 0.36881579852748914, "grad_norm": 0.3009981595090159, "learning_rate": 2.8381921029406464e-05, "loss": 0.8346, "step": 1340 }, { "epoch": 0.3690910341980321, "grad_norm": 0.32567627898081886, "learning_rate": 2.8366053630201577e-05, "loss": 0.8052, "step": 1341 }, { "epoch": 0.36936626986857496, "grad_norm": 0.3429285129950298, "learning_rate": 2.8350179845701267e-05, "loss": 0.7973, "step": 1342 }, { "epoch": 0.36964150553911784, "grad_norm": 0.32101941136515527, "learning_rate": 2.8334299688021002e-05, "loss": 0.7935, "step": 1343 }, { "epoch": 0.3699167412096608, "grad_norm": 0.32910331378716223, "learning_rate": 2.8318413169281146e-05, "loss": 0.8145, "step": 1344 }, { "epoch": 0.37019197688020367, "grad_norm": 0.3326953554224791, "learning_rate": 2.830252030160689e-05, "loss": 0.7849, "step": 1345 }, { "epoch": 0.37046721255074655, "grad_norm": 0.3236119360588396, "learning_rate": 2.8286621097128298e-05, "loss": 0.8243, "step": 1346 }, { "epoch": 0.3707424482212895, "grad_norm": 0.3568165246303928, "learning_rate": 2.8270715567980248e-05, "loss": 0.8101, "step": 1347 }, { "epoch": 0.3710176838918324, "grad_norm": 0.3867713763535159, "learning_rate": 2.825480372630246e-05, "loss": 0.8066, "step": 1348 }, { "epoch": 0.37129291956237526, "grad_norm": 0.40846663591430193, "learning_rate": 2.8238885584239458e-05, "loss": 0.8294, "step": 1349 }, { "epoch": 0.3715681552329182, "grad_norm": 0.3200132627526865, "learning_rate": 2.8222961153940595e-05, "loss": 0.7819, "step": 1350 }, { "epoch": 0.3718433909034611, "grad_norm": 0.3602760939053267, "learning_rate": 2.8207030447560003e-05, "loss": 0.7826, "step": 1351 }, { "epoch": 0.37211862657400396, "grad_norm": 0.369247157604994, "learning_rate": 2.819109347725662e-05, "loss": 0.8268, "step": 1352 }, { "epoch": 0.3723938622445469, "grad_norm": 0.3170393651874839, "learning_rate": 2.817515025519415e-05, "loss": 0.7882, "step": 1353 }, { "epoch": 0.3726690979150898, "grad_norm": 0.3326519408046655, "learning_rate": 2.8159200793541078e-05, "loss": 0.768, "step": 1354 }, { "epoch": 0.3729443335856327, "grad_norm": 0.34434308946721537, "learning_rate": 2.8143245104470653e-05, "loss": 0.7953, "step": 1355 }, { "epoch": 0.3732195692561756, "grad_norm": 0.3623225721725796, "learning_rate": 2.812728320016087e-05, "loss": 0.8252, "step": 1356 }, { "epoch": 0.3734948049267185, "grad_norm": 0.315199315152692, "learning_rate": 2.811131509279448e-05, "loss": 0.7848, "step": 1357 }, { "epoch": 0.37377004059726143, "grad_norm": 0.36338809342252487, "learning_rate": 2.8095340794558946e-05, "loss": 0.7896, "step": 1358 }, { "epoch": 0.3740452762678043, "grad_norm": 0.33988187467434927, "learning_rate": 2.8079360317646474e-05, "loss": 0.812, "step": 1359 }, { "epoch": 0.3743205119383472, "grad_norm": 0.2857496207925808, "learning_rate": 2.8063373674253983e-05, "loss": 0.7922, "step": 1360 }, { "epoch": 0.37459574760889014, "grad_norm": 0.3330778243852057, "learning_rate": 2.8047380876583105e-05, "loss": 0.8094, "step": 1361 }, { "epoch": 0.374870983279433, "grad_norm": 0.3244271784314232, "learning_rate": 2.8031381936840153e-05, "loss": 0.8078, "step": 1362 }, { "epoch": 0.3751462189499759, "grad_norm": 0.3250873656533473, "learning_rate": 2.801537686723613e-05, "loss": 0.8411, "step": 1363 }, { "epoch": 0.37542145462051885, "grad_norm": 0.34791160182523184, "learning_rate": 2.7999365679986733e-05, "loss": 0.8581, "step": 1364 }, { "epoch": 0.37569669029106173, "grad_norm": 0.30304224891942977, "learning_rate": 2.798334838731232e-05, "loss": 0.8043, "step": 1365 }, { "epoch": 0.3759719259616046, "grad_norm": 0.3257738410421442, "learning_rate": 2.79673250014379e-05, "loss": 0.8315, "step": 1366 }, { "epoch": 0.37624716163214755, "grad_norm": 0.31452388703585527, "learning_rate": 2.795129553459315e-05, "loss": 0.8372, "step": 1367 }, { "epoch": 0.37652239730269044, "grad_norm": 0.3269610810342665, "learning_rate": 2.793525999901237e-05, "loss": 0.8201, "step": 1368 }, { "epoch": 0.3767976329732333, "grad_norm": 0.3293112124741156, "learning_rate": 2.79192184069345e-05, "loss": 0.8111, "step": 1369 }, { "epoch": 0.37707286864377626, "grad_norm": 0.3190169318875583, "learning_rate": 2.7903170770603113e-05, "loss": 0.8161, "step": 1370 }, { "epoch": 0.37734810431431914, "grad_norm": 0.335441432819471, "learning_rate": 2.7887117102266373e-05, "loss": 0.7934, "step": 1371 }, { "epoch": 0.377623339984862, "grad_norm": 0.3194213789437805, "learning_rate": 2.787105741417707e-05, "loss": 0.7942, "step": 1372 }, { "epoch": 0.37789857565540497, "grad_norm": 0.343590881106002, "learning_rate": 2.7854991718592573e-05, "loss": 0.8043, "step": 1373 }, { "epoch": 0.37817381132594785, "grad_norm": 0.35997843073088864, "learning_rate": 2.783892002777484e-05, "loss": 0.8008, "step": 1374 }, { "epoch": 0.37844904699649073, "grad_norm": 0.3612927520612644, "learning_rate": 2.7822842353990412e-05, "loss": 0.8154, "step": 1375 }, { "epoch": 0.3787242826670337, "grad_norm": 0.33095297214331687, "learning_rate": 2.780675870951039e-05, "loss": 0.8079, "step": 1376 }, { "epoch": 0.37899951833757656, "grad_norm": 0.31220387603827354, "learning_rate": 2.779066910661043e-05, "loss": 0.7997, "step": 1377 }, { "epoch": 0.37927475400811944, "grad_norm": 0.31939710952173245, "learning_rate": 2.7774573557570743e-05, "loss": 0.7874, "step": 1378 }, { "epoch": 0.3795499896786624, "grad_norm": 0.32963907224176453, "learning_rate": 2.775847207467607e-05, "loss": 0.7906, "step": 1379 }, { "epoch": 0.37982522534920526, "grad_norm": 0.3037381999100857, "learning_rate": 2.7742364670215686e-05, "loss": 0.8022, "step": 1380 }, { "epoch": 0.38010046101974815, "grad_norm": 0.3031129728217763, "learning_rate": 2.772625135648338e-05, "loss": 0.8284, "step": 1381 }, { "epoch": 0.3803756966902911, "grad_norm": 0.28316725699609346, "learning_rate": 2.7710132145777465e-05, "loss": 0.7782, "step": 1382 }, { "epoch": 0.38065093236083397, "grad_norm": 0.34053413103169805, "learning_rate": 2.7694007050400743e-05, "loss": 0.7869, "step": 1383 }, { "epoch": 0.38092616803137685, "grad_norm": 0.28214837984365054, "learning_rate": 2.7677876082660504e-05, "loss": 0.7928, "step": 1384 }, { "epoch": 0.3812014037019198, "grad_norm": 0.3544908061657199, "learning_rate": 2.7661739254868534e-05, "loss": 0.8122, "step": 1385 }, { "epoch": 0.3814766393724627, "grad_norm": 0.32836527286990924, "learning_rate": 2.7645596579341077e-05, "loss": 0.8134, "step": 1386 }, { "epoch": 0.38175187504300556, "grad_norm": 0.32829850154101425, "learning_rate": 2.762944806839885e-05, "loss": 0.8211, "step": 1387 }, { "epoch": 0.3820271107135485, "grad_norm": 0.33511536227788746, "learning_rate": 2.7613293734367014e-05, "loss": 0.8221, "step": 1388 }, { "epoch": 0.3823023463840914, "grad_norm": 0.3394497631985741, "learning_rate": 2.7597133589575197e-05, "loss": 0.8226, "step": 1389 }, { "epoch": 0.38257758205463427, "grad_norm": 0.2910092588497054, "learning_rate": 2.758096764635743e-05, "loss": 0.7918, "step": 1390 }, { "epoch": 0.3828528177251772, "grad_norm": 0.4023183538104028, "learning_rate": 2.7564795917052194e-05, "loss": 0.803, "step": 1391 }, { "epoch": 0.3831280533957201, "grad_norm": 0.36955267265567154, "learning_rate": 2.7548618414002368e-05, "loss": 0.793, "step": 1392 }, { "epoch": 0.383403289066263, "grad_norm": 0.31507516369283994, "learning_rate": 2.7532435149555268e-05, "loss": 0.7956, "step": 1393 }, { "epoch": 0.3836785247368059, "grad_norm": 0.3773775660294882, "learning_rate": 2.7516246136062567e-05, "loss": 0.7838, "step": 1394 }, { "epoch": 0.3839537604073488, "grad_norm": 0.37501625627895047, "learning_rate": 2.7500051385880347e-05, "loss": 0.7738, "step": 1395 }, { "epoch": 0.3842289960778917, "grad_norm": 0.2974661971665518, "learning_rate": 2.748385091136908e-05, "loss": 0.8174, "step": 1396 }, { "epoch": 0.3845042317484346, "grad_norm": 0.3505183922056784, "learning_rate": 2.7467644724893583e-05, "loss": 0.8054, "step": 1397 }, { "epoch": 0.3847794674189775, "grad_norm": 0.30899185617899977, "learning_rate": 2.7451432838823047e-05, "loss": 0.7879, "step": 1398 }, { "epoch": 0.3850547030895204, "grad_norm": 0.335348274613658, "learning_rate": 2.743521526553101e-05, "loss": 0.8324, "step": 1399 }, { "epoch": 0.38532993876006333, "grad_norm": 0.3633470391262056, "learning_rate": 2.741899201739536e-05, "loss": 0.7793, "step": 1400 }, { "epoch": 0.3856051744306062, "grad_norm": 0.35671470264157973, "learning_rate": 2.7402763106798295e-05, "loss": 0.7812, "step": 1401 }, { "epoch": 0.3858804101011491, "grad_norm": 0.44871307083826245, "learning_rate": 2.7386528546126342e-05, "loss": 0.7731, "step": 1402 }, { "epoch": 0.38615564577169204, "grad_norm": 0.3400205353421276, "learning_rate": 2.7370288347770358e-05, "loss": 0.7992, "step": 1403 }, { "epoch": 0.3864308814422349, "grad_norm": 0.3187980198359592, "learning_rate": 2.7354042524125483e-05, "loss": 0.8159, "step": 1404 }, { "epoch": 0.3867061171127778, "grad_norm": 0.28815150525493677, "learning_rate": 2.7337791087591162e-05, "loss": 0.8013, "step": 1405 }, { "epoch": 0.38698135278332074, "grad_norm": 0.34147251379145943, "learning_rate": 2.7321534050571115e-05, "loss": 0.8073, "step": 1406 }, { "epoch": 0.3872565884538636, "grad_norm": 0.3065715702209035, "learning_rate": 2.7305271425473345e-05, "loss": 0.7939, "step": 1407 }, { "epoch": 0.3875318241244065, "grad_norm": 0.2981957054027009, "learning_rate": 2.7289003224710103e-05, "loss": 0.8513, "step": 1408 }, { "epoch": 0.38780705979494945, "grad_norm": 0.30110790776698665, "learning_rate": 2.7272729460697927e-05, "loss": 0.7819, "step": 1409 }, { "epoch": 0.38808229546549233, "grad_norm": 0.30481861252779535, "learning_rate": 2.7256450145857578e-05, "loss": 0.8105, "step": 1410 }, { "epoch": 0.3883575311360352, "grad_norm": 0.2987766885062007, "learning_rate": 2.7240165292614055e-05, "loss": 0.8198, "step": 1411 }, { "epoch": 0.38863276680657816, "grad_norm": 0.32115251798317024, "learning_rate": 2.722387491339658e-05, "loss": 0.8008, "step": 1412 }, { "epoch": 0.38890800247712104, "grad_norm": 0.33777018743725346, "learning_rate": 2.720757902063861e-05, "loss": 0.7782, "step": 1413 }, { "epoch": 0.3891832381476639, "grad_norm": 0.3296690616192377, "learning_rate": 2.71912776267778e-05, "loss": 0.8215, "step": 1414 }, { "epoch": 0.38945847381820686, "grad_norm": 0.31869005654181193, "learning_rate": 2.7174970744256e-05, "loss": 0.7769, "step": 1415 }, { "epoch": 0.38973370948874975, "grad_norm": 0.31483879090434064, "learning_rate": 2.715865838551925e-05, "loss": 0.817, "step": 1416 }, { "epoch": 0.39000894515929263, "grad_norm": 0.31195017324740143, "learning_rate": 2.714234056301778e-05, "loss": 0.8031, "step": 1417 }, { "epoch": 0.39028418082983557, "grad_norm": 0.3339314715373165, "learning_rate": 2.7126017289205977e-05, "loss": 0.8306, "step": 1418 }, { "epoch": 0.39055941650037845, "grad_norm": 0.5165042933029558, "learning_rate": 2.71096885765424e-05, "loss": 0.7939, "step": 1419 }, { "epoch": 0.39083465217092134, "grad_norm": 0.3325636248893185, "learning_rate": 2.7093354437489744e-05, "loss": 0.823, "step": 1420 }, { "epoch": 0.3911098878414643, "grad_norm": 0.32175997339050016, "learning_rate": 2.7077014884514867e-05, "loss": 0.8238, "step": 1421 }, { "epoch": 0.39138512351200716, "grad_norm": 0.3183119811402347, "learning_rate": 2.7060669930088744e-05, "loss": 0.7902, "step": 1422 }, { "epoch": 0.39166035918255004, "grad_norm": 0.3190233340184448, "learning_rate": 2.7044319586686464e-05, "loss": 0.7957, "step": 1423 }, { "epoch": 0.391935594853093, "grad_norm": 0.358994548429445, "learning_rate": 2.7027963866787255e-05, "loss": 0.7982, "step": 1424 }, { "epoch": 0.39221083052363587, "grad_norm": 0.2813243071562666, "learning_rate": 2.701160278287443e-05, "loss": 0.7993, "step": 1425 }, { "epoch": 0.39248606619417875, "grad_norm": 0.361100554529105, "learning_rate": 2.6995236347435402e-05, "loss": 0.8183, "step": 1426 }, { "epoch": 0.3927613018647217, "grad_norm": 0.297859261527035, "learning_rate": 2.697886457296166e-05, "loss": 0.8051, "step": 1427 }, { "epoch": 0.3930365375352646, "grad_norm": 0.3076915112032659, "learning_rate": 2.6962487471948787e-05, "loss": 0.8015, "step": 1428 }, { "epoch": 0.39331177320580746, "grad_norm": 0.30013010580384464, "learning_rate": 2.6946105056896406e-05, "loss": 0.8217, "step": 1429 }, { "epoch": 0.3935870088763504, "grad_norm": 0.3067900479520812, "learning_rate": 2.692971734030822e-05, "loss": 0.8357, "step": 1430 }, { "epoch": 0.3938622445468933, "grad_norm": 0.33042204011575527, "learning_rate": 2.6913324334691965e-05, "loss": 0.8187, "step": 1431 }, { "epoch": 0.39413748021743616, "grad_norm": 1.0235278366123177, "learning_rate": 2.6896926052559412e-05, "loss": 0.8055, "step": 1432 }, { "epoch": 0.3944127158879791, "grad_norm": 0.3132802404760063, "learning_rate": 2.688052250642637e-05, "loss": 0.8033, "step": 1433 }, { "epoch": 0.394687951558522, "grad_norm": 0.2859859821046954, "learning_rate": 2.6864113708812652e-05, "loss": 0.8039, "step": 1434 }, { "epoch": 0.39496318722906487, "grad_norm": 0.31601424136481243, "learning_rate": 2.6847699672242086e-05, "loss": 0.7931, "step": 1435 }, { "epoch": 0.3952384228996078, "grad_norm": 0.36800909101564766, "learning_rate": 2.683128040924251e-05, "loss": 0.8275, "step": 1436 }, { "epoch": 0.3955136585701507, "grad_norm": 0.3155541640988661, "learning_rate": 2.6814855932345733e-05, "loss": 0.7825, "step": 1437 }, { "epoch": 0.3957888942406936, "grad_norm": 0.3373869372996058, "learning_rate": 2.679842625408755e-05, "loss": 0.7869, "step": 1438 }, { "epoch": 0.3960641299112365, "grad_norm": 0.32653211745613714, "learning_rate": 2.6781991387007725e-05, "loss": 0.8131, "step": 1439 }, { "epoch": 0.3963393655817794, "grad_norm": 0.3268275942679181, "learning_rate": 2.676555134364999e-05, "loss": 0.7823, "step": 1440 }, { "epoch": 0.3966146012523223, "grad_norm": 0.30157734350933985, "learning_rate": 2.674910613656201e-05, "loss": 0.8052, "step": 1441 }, { "epoch": 0.3968898369228652, "grad_norm": 0.33513607003793183, "learning_rate": 2.6732655778295416e-05, "loss": 0.7968, "step": 1442 }, { "epoch": 0.3971650725934081, "grad_norm": 0.2988598809491412, "learning_rate": 2.671620028140575e-05, "loss": 0.8164, "step": 1443 }, { "epoch": 0.397440308263951, "grad_norm": 0.2974119303314532, "learning_rate": 2.6699739658452488e-05, "loss": 0.7867, "step": 1444 }, { "epoch": 0.39771554393449393, "grad_norm": 0.2927689415610301, "learning_rate": 2.6683273921999e-05, "loss": 0.7959, "step": 1445 }, { "epoch": 0.3979907796050368, "grad_norm": 0.3034062685357057, "learning_rate": 2.6666803084612586e-05, "loss": 0.7609, "step": 1446 }, { "epoch": 0.3982660152755797, "grad_norm": 0.29511262430624435, "learning_rate": 2.6650327158864423e-05, "loss": 0.8057, "step": 1447 }, { "epoch": 0.39854125094612264, "grad_norm": 0.31100547478830526, "learning_rate": 2.663384615732957e-05, "loss": 0.8007, "step": 1448 }, { "epoch": 0.3988164866166655, "grad_norm": 0.2987495592927288, "learning_rate": 2.6617360092586973e-05, "loss": 0.7742, "step": 1449 }, { "epoch": 0.3990917222872084, "grad_norm": 0.29353723403945653, "learning_rate": 2.6600868977219428e-05, "loss": 0.7967, "step": 1450 }, { "epoch": 0.39936695795775135, "grad_norm": 0.2943294262418574, "learning_rate": 2.6584372823813588e-05, "loss": 0.7832, "step": 1451 }, { "epoch": 0.39964219362829423, "grad_norm": 0.28764564181784025, "learning_rate": 2.6567871644959954e-05, "loss": 0.8084, "step": 1452 }, { "epoch": 0.3999174292988371, "grad_norm": 0.30420633598929464, "learning_rate": 2.6551365453252872e-05, "loss": 0.83, "step": 1453 }, { "epoch": 0.40019266496938005, "grad_norm": 0.28961885854550073, "learning_rate": 2.6534854261290504e-05, "loss": 0.8253, "step": 1454 }, { "epoch": 0.40046790063992294, "grad_norm": 0.29650316278232675, "learning_rate": 2.651833808167482e-05, "loss": 0.7987, "step": 1455 }, { "epoch": 0.4007431363104658, "grad_norm": 0.28840641987030097, "learning_rate": 2.6501816927011616e-05, "loss": 0.808, "step": 1456 }, { "epoch": 0.40101837198100876, "grad_norm": 0.2632624554330908, "learning_rate": 2.6485290809910473e-05, "loss": 0.7983, "step": 1457 }, { "epoch": 0.40129360765155164, "grad_norm": 0.3010079679168447, "learning_rate": 2.6468759742984763e-05, "loss": 0.8227, "step": 1458 }, { "epoch": 0.4015688433220945, "grad_norm": 0.28604558779891986, "learning_rate": 2.6452223738851634e-05, "loss": 0.8147, "step": 1459 }, { "epoch": 0.40184407899263747, "grad_norm": 0.2976386009217243, "learning_rate": 2.6435682810132007e-05, "loss": 0.772, "step": 1460 }, { "epoch": 0.40211931466318035, "grad_norm": 0.2752057004017899, "learning_rate": 2.641913696945055e-05, "loss": 0.8028, "step": 1461 }, { "epoch": 0.40239455033372323, "grad_norm": 1.8613068342598982, "learning_rate": 2.6402586229435694e-05, "loss": 0.8125, "step": 1462 }, { "epoch": 0.4026697860042662, "grad_norm": 0.33706353100482705, "learning_rate": 2.63860306027196e-05, "loss": 0.8084, "step": 1463 }, { "epoch": 0.40294502167480906, "grad_norm": 0.306332125455289, "learning_rate": 2.636947010193817e-05, "loss": 0.7956, "step": 1464 }, { "epoch": 0.40322025734535194, "grad_norm": 0.32297963436227717, "learning_rate": 2.6352904739731007e-05, "loss": 0.8011, "step": 1465 }, { "epoch": 0.4034954930158949, "grad_norm": 0.874575402840066, "learning_rate": 2.6336334528741442e-05, "loss": 0.8164, "step": 1466 }, { "epoch": 0.40377072868643776, "grad_norm": 0.8744408711668231, "learning_rate": 2.63197594816165e-05, "loss": 0.8105, "step": 1467 }, { "epoch": 0.40404596435698065, "grad_norm": 0.3646798763416938, "learning_rate": 2.6303179611006896e-05, "loss": 0.8017, "step": 1468 }, { "epoch": 0.4043212000275236, "grad_norm": 0.32039312801768444, "learning_rate": 2.628659492956703e-05, "loss": 0.8154, "step": 1469 }, { "epoch": 0.40459643569806647, "grad_norm": 0.3728939261903309, "learning_rate": 2.6270005449954972e-05, "loss": 0.8188, "step": 1470 }, { "epoch": 0.40487167136860935, "grad_norm": 0.34239940638498284, "learning_rate": 2.6253411184832454e-05, "loss": 0.8038, "step": 1471 }, { "epoch": 0.4051469070391523, "grad_norm": 0.34487757127620067, "learning_rate": 2.6236812146864853e-05, "loss": 0.7801, "step": 1472 }, { "epoch": 0.4054221427096952, "grad_norm": 0.36325221927535567, "learning_rate": 2.62202083487212e-05, "loss": 0.822, "step": 1473 }, { "epoch": 0.40569737838023806, "grad_norm": 0.37044590278077094, "learning_rate": 2.6203599803074165e-05, "loss": 0.8536, "step": 1474 }, { "epoch": 0.405972614050781, "grad_norm": 0.4293282038813741, "learning_rate": 2.6186986522600023e-05, "loss": 0.7903, "step": 1475 }, { "epoch": 0.4062478497213239, "grad_norm": 0.31220366799140137, "learning_rate": 2.617036851997867e-05, "loss": 0.7654, "step": 1476 }, { "epoch": 0.40652308539186677, "grad_norm": 0.3269640608562408, "learning_rate": 2.6153745807893615e-05, "loss": 0.7918, "step": 1477 }, { "epoch": 0.4067983210624097, "grad_norm": 0.3257998229141931, "learning_rate": 2.6137118399031946e-05, "loss": 0.8108, "step": 1478 }, { "epoch": 0.4070735567329526, "grad_norm": 0.38549640403772, "learning_rate": 2.612048630608435e-05, "loss": 0.8208, "step": 1479 }, { "epoch": 0.4073487924034955, "grad_norm": 0.33409143137306907, "learning_rate": 2.6103849541745085e-05, "loss": 0.7759, "step": 1480 }, { "epoch": 0.4076240280740384, "grad_norm": 0.300974685979459, "learning_rate": 2.608720811871196e-05, "loss": 0.8014, "step": 1481 }, { "epoch": 0.4078992637445813, "grad_norm": 0.3072004155673629, "learning_rate": 2.607056204968637e-05, "loss": 0.7928, "step": 1482 }, { "epoch": 0.4081744994151242, "grad_norm": 0.44751117347870023, "learning_rate": 2.605391134737322e-05, "loss": 0.7873, "step": 1483 }, { "epoch": 0.4084497350856671, "grad_norm": 0.29147020206747637, "learning_rate": 2.6037256024480985e-05, "loss": 0.819, "step": 1484 }, { "epoch": 0.40872497075621, "grad_norm": 0.29787047831873453, "learning_rate": 2.6020596093721643e-05, "loss": 0.7967, "step": 1485 }, { "epoch": 0.4090002064267529, "grad_norm": 0.3234064883399612, "learning_rate": 2.60039315678107e-05, "loss": 0.8082, "step": 1486 }, { "epoch": 0.4092754420972958, "grad_norm": 0.2822883935726763, "learning_rate": 2.5987262459467168e-05, "loss": 0.7919, "step": 1487 }, { "epoch": 0.4095506777678387, "grad_norm": 0.337762827412016, "learning_rate": 2.597058878141354e-05, "loss": 0.824, "step": 1488 }, { "epoch": 0.4098259134383816, "grad_norm": 0.3351266198568725, "learning_rate": 2.5953910546375827e-05, "loss": 0.8169, "step": 1489 }, { "epoch": 0.41010114910892453, "grad_norm": 0.27981369269275125, "learning_rate": 2.5937227767083503e-05, "loss": 0.7986, "step": 1490 }, { "epoch": 0.4103763847794674, "grad_norm": 0.33122050921440876, "learning_rate": 2.59205404562695e-05, "loss": 0.7831, "step": 1491 }, { "epoch": 0.4106516204500103, "grad_norm": 0.31103569343053505, "learning_rate": 2.5903848626670227e-05, "loss": 0.7963, "step": 1492 }, { "epoch": 0.41092685612055324, "grad_norm": 0.2872677075818124, "learning_rate": 2.5887152291025532e-05, "loss": 0.7874, "step": 1493 }, { "epoch": 0.4112020917910961, "grad_norm": 0.2803646269116244, "learning_rate": 2.5870451462078697e-05, "loss": 0.8081, "step": 1494 }, { "epoch": 0.411477327461639, "grad_norm": 0.2887233841614559, "learning_rate": 2.5853746152576443e-05, "loss": 0.8068, "step": 1495 }, { "epoch": 0.41175256313218195, "grad_norm": 0.2691437300859037, "learning_rate": 2.5837036375268916e-05, "loss": 0.807, "step": 1496 }, { "epoch": 0.41202779880272483, "grad_norm": 0.2939469370716576, "learning_rate": 2.582032214290966e-05, "loss": 0.8074, "step": 1497 }, { "epoch": 0.4123030344732677, "grad_norm": 0.2962223128230255, "learning_rate": 2.5803603468255612e-05, "loss": 0.784, "step": 1498 }, { "epoch": 0.41257827014381065, "grad_norm": 0.30684610616954827, "learning_rate": 2.5786880364067118e-05, "loss": 0.8177, "step": 1499 }, { "epoch": 0.41285350581435354, "grad_norm": 0.30165991323175034, "learning_rate": 2.5770152843107906e-05, "loss": 0.7854, "step": 1500 }, { "epoch": 0.4131287414848964, "grad_norm": 0.344845393306954, "learning_rate": 2.5753420918145054e-05, "loss": 0.7884, "step": 1501 }, { "epoch": 0.41340397715543936, "grad_norm": 0.2749121369417589, "learning_rate": 2.5736684601949016e-05, "loss": 0.7875, "step": 1502 }, { "epoch": 0.41367921282598225, "grad_norm": 0.31662733333357823, "learning_rate": 2.5719943907293604e-05, "loss": 0.7919, "step": 1503 }, { "epoch": 0.41395444849652513, "grad_norm": 0.4084878913616865, "learning_rate": 2.5703198846955948e-05, "loss": 0.7965, "step": 1504 }, { "epoch": 0.41422968416706807, "grad_norm": 0.28272609789134145, "learning_rate": 2.5686449433716542e-05, "loss": 0.8028, "step": 1505 }, { "epoch": 0.41450491983761095, "grad_norm": 0.3092023302292874, "learning_rate": 2.5669695680359173e-05, "loss": 0.7992, "step": 1506 }, { "epoch": 0.41478015550815384, "grad_norm": 0.29500853646346326, "learning_rate": 2.5652937599670962e-05, "loss": 0.83, "step": 1507 }, { "epoch": 0.4150553911786968, "grad_norm": 0.30316421568834717, "learning_rate": 2.5636175204442317e-05, "loss": 0.819, "step": 1508 }, { "epoch": 0.41533062684923966, "grad_norm": 0.2837657249146373, "learning_rate": 2.5619408507466945e-05, "loss": 0.7702, "step": 1509 }, { "epoch": 0.41560586251978254, "grad_norm": 0.2872567530513789, "learning_rate": 2.560263752154184e-05, "loss": 0.8166, "step": 1510 }, { "epoch": 0.4158810981903255, "grad_norm": 0.2933075992543045, "learning_rate": 2.5585862259467274e-05, "loss": 0.8066, "step": 1511 }, { "epoch": 0.41615633386086837, "grad_norm": 0.32175892636432013, "learning_rate": 2.5569082734046765e-05, "loss": 0.8005, "step": 1512 }, { "epoch": 0.41643156953141125, "grad_norm": 0.29372399113648706, "learning_rate": 2.555229895808709e-05, "loss": 0.7922, "step": 1513 }, { "epoch": 0.4167068052019542, "grad_norm": 0.29651349158098117, "learning_rate": 2.553551094439829e-05, "loss": 0.7814, "step": 1514 }, { "epoch": 0.4169820408724971, "grad_norm": 0.31559421692998985, "learning_rate": 2.5518718705793618e-05, "loss": 0.7965, "step": 1515 }, { "epoch": 0.41725727654303996, "grad_norm": 0.291214389213605, "learning_rate": 2.5501922255089563e-05, "loss": 0.8009, "step": 1516 }, { "epoch": 0.4175325122135829, "grad_norm": 0.28998397210337973, "learning_rate": 2.5485121605105825e-05, "loss": 0.8044, "step": 1517 }, { "epoch": 0.4178077478841258, "grad_norm": 0.2688054846484204, "learning_rate": 2.54683167686653e-05, "loss": 0.8056, "step": 1518 }, { "epoch": 0.41808298355466866, "grad_norm": 0.2938280545832689, "learning_rate": 2.5451507758594106e-05, "loss": 0.7715, "step": 1519 }, { "epoch": 0.4183582192252116, "grad_norm": 0.3213269323332592, "learning_rate": 2.543469458772151e-05, "loss": 0.8034, "step": 1520 }, { "epoch": 0.4186334548957545, "grad_norm": 0.28593880805933963, "learning_rate": 2.5417877268879987e-05, "loss": 0.8068, "step": 1521 }, { "epoch": 0.41890869056629737, "grad_norm": 0.35744553405482726, "learning_rate": 2.540105581490516e-05, "loss": 0.7807, "step": 1522 }, { "epoch": 0.4191839262368403, "grad_norm": 0.30165493632045265, "learning_rate": 2.5384230238635814e-05, "loss": 0.8216, "step": 1523 }, { "epoch": 0.4194591619073832, "grad_norm": 0.3165521422964494, "learning_rate": 2.5367400552913876e-05, "loss": 0.8086, "step": 1524 }, { "epoch": 0.4197343975779261, "grad_norm": 0.3100636628963957, "learning_rate": 2.5350566770584423e-05, "loss": 0.7844, "step": 1525 }, { "epoch": 0.420009633248469, "grad_norm": 0.280191236586672, "learning_rate": 2.5333728904495633e-05, "loss": 0.7865, "step": 1526 }, { "epoch": 0.4202848689190119, "grad_norm": 0.29718483835011467, "learning_rate": 2.531688696749882e-05, "loss": 0.7895, "step": 1527 }, { "epoch": 0.4205601045895548, "grad_norm": 0.3189160667948843, "learning_rate": 2.5300040972448407e-05, "loss": 0.7886, "step": 1528 }, { "epoch": 0.4208353402600977, "grad_norm": 0.346648339637172, "learning_rate": 2.5283190932201905e-05, "loss": 0.813, "step": 1529 }, { "epoch": 0.4211105759306406, "grad_norm": 0.4196447797342443, "learning_rate": 2.526633685961992e-05, "loss": 0.7752, "step": 1530 }, { "epoch": 0.4213858116011835, "grad_norm": 0.3591666823765334, "learning_rate": 2.5249478767566128e-05, "loss": 0.7983, "step": 1531 }, { "epoch": 0.42166104727172643, "grad_norm": 0.3157911243028244, "learning_rate": 2.5232616668907272e-05, "loss": 0.7752, "step": 1532 }, { "epoch": 0.4219362829422693, "grad_norm": 0.3469253855461703, "learning_rate": 2.521575057651317e-05, "loss": 0.8002, "step": 1533 }, { "epoch": 0.4222115186128122, "grad_norm": 0.37750370249049303, "learning_rate": 2.5198880503256656e-05, "loss": 0.7877, "step": 1534 }, { "epoch": 0.42248675428335514, "grad_norm": 0.3462659714898315, "learning_rate": 2.518200646201364e-05, "loss": 0.8244, "step": 1535 }, { "epoch": 0.422761989953898, "grad_norm": 0.40500355446545444, "learning_rate": 2.5165128465663035e-05, "loss": 0.8043, "step": 1536 }, { "epoch": 0.4230372256244409, "grad_norm": 0.3376593793085698, "learning_rate": 2.5148246527086773e-05, "loss": 0.8066, "step": 1537 }, { "epoch": 0.42331246129498384, "grad_norm": 0.31106951332736665, "learning_rate": 2.5131360659169817e-05, "loss": 0.8054, "step": 1538 }, { "epoch": 0.4235876969655267, "grad_norm": 0.32605582211855666, "learning_rate": 2.5114470874800106e-05, "loss": 0.7953, "step": 1539 }, { "epoch": 0.4238629326360696, "grad_norm": 0.32233029068351515, "learning_rate": 2.509757718686858e-05, "loss": 0.7968, "step": 1540 }, { "epoch": 0.42413816830661255, "grad_norm": 0.3141658510318051, "learning_rate": 2.5080679608269143e-05, "loss": 0.825, "step": 1541 }, { "epoch": 0.42441340397715543, "grad_norm": 0.3429314163930497, "learning_rate": 2.5063778151898688e-05, "loss": 0.769, "step": 1542 }, { "epoch": 0.4246886396476983, "grad_norm": 0.3532207907958763, "learning_rate": 2.504687283065707e-05, "loss": 0.7781, "step": 1543 }, { "epoch": 0.42496387531824126, "grad_norm": 0.31220809786001236, "learning_rate": 2.5029963657447063e-05, "loss": 0.8076, "step": 1544 }, { "epoch": 0.42523911098878414, "grad_norm": 0.34181803029550617, "learning_rate": 2.5013050645174414e-05, "loss": 0.7757, "step": 1545 }, { "epoch": 0.425514346659327, "grad_norm": 0.2934744581681451, "learning_rate": 2.4996133806747786e-05, "loss": 0.8182, "step": 1546 }, { "epoch": 0.42578958232986996, "grad_norm": 0.2954462476060033, "learning_rate": 2.4979213155078758e-05, "loss": 0.8154, "step": 1547 }, { "epoch": 0.42606481800041285, "grad_norm": 0.30627584397965296, "learning_rate": 2.4962288703081833e-05, "loss": 0.7958, "step": 1548 }, { "epoch": 0.42634005367095573, "grad_norm": 0.3184444669803208, "learning_rate": 2.4945360463674408e-05, "loss": 0.7958, "step": 1549 }, { "epoch": 0.42661528934149867, "grad_norm": 0.29221372217687863, "learning_rate": 2.492842844977677e-05, "loss": 0.8376, "step": 1550 }, { "epoch": 0.42689052501204156, "grad_norm": 0.30012765565232413, "learning_rate": 2.4911492674312072e-05, "loss": 0.807, "step": 1551 }, { "epoch": 0.42716576068258444, "grad_norm": 0.31353031412169613, "learning_rate": 2.4894553150206364e-05, "loss": 0.7936, "step": 1552 }, { "epoch": 0.4274409963531274, "grad_norm": 0.2990620959446403, "learning_rate": 2.4877609890388544e-05, "loss": 0.7894, "step": 1553 }, { "epoch": 0.42771623202367026, "grad_norm": 0.3214884522984842, "learning_rate": 2.4860662907790363e-05, "loss": 0.7982, "step": 1554 }, { "epoch": 0.42799146769421315, "grad_norm": 0.30848511206629325, "learning_rate": 2.484371221534641e-05, "loss": 0.7795, "step": 1555 }, { "epoch": 0.4282667033647561, "grad_norm": 0.289204480093799, "learning_rate": 2.4826757825994116e-05, "loss": 0.829, "step": 1556 }, { "epoch": 0.42854193903529897, "grad_norm": 0.28512723873036044, "learning_rate": 2.480979975267372e-05, "loss": 0.7994, "step": 1557 }, { "epoch": 0.42881717470584185, "grad_norm": 0.34065328908174497, "learning_rate": 2.4792838008328273e-05, "loss": 0.7948, "step": 1558 }, { "epoch": 0.4290924103763848, "grad_norm": 0.2985031897281868, "learning_rate": 2.4775872605903644e-05, "loss": 0.8079, "step": 1559 }, { "epoch": 0.4293676460469277, "grad_norm": 0.3198267846931661, "learning_rate": 2.4758903558348485e-05, "loss": 0.7749, "step": 1560 }, { "epoch": 0.42964288171747056, "grad_norm": 0.36530245462620264, "learning_rate": 2.474193087861422e-05, "loss": 0.7844, "step": 1561 }, { "epoch": 0.4299181173880135, "grad_norm": 0.29563438375387263, "learning_rate": 2.472495457965506e-05, "loss": 0.7743, "step": 1562 }, { "epoch": 0.4301933530585564, "grad_norm": 0.2953487472265621, "learning_rate": 2.470797467442797e-05, "loss": 0.8117, "step": 1563 }, { "epoch": 0.43046858872909927, "grad_norm": 0.3279910813270692, "learning_rate": 2.4690991175892663e-05, "loss": 0.8109, "step": 1564 }, { "epoch": 0.4307438243996422, "grad_norm": 0.32686073979880587, "learning_rate": 2.467400409701162e-05, "loss": 0.8147, "step": 1565 }, { "epoch": 0.4310190600701851, "grad_norm": 0.2893424695495428, "learning_rate": 2.465701345075002e-05, "loss": 0.8046, "step": 1566 }, { "epoch": 0.431294295740728, "grad_norm": 0.3173272950085369, "learning_rate": 2.4640019250075788e-05, "loss": 0.7748, "step": 1567 }, { "epoch": 0.4315695314112709, "grad_norm": 0.27879186742790907, "learning_rate": 2.4623021507959552e-05, "loss": 0.8055, "step": 1568 }, { "epoch": 0.4318447670818138, "grad_norm": 0.34456420267891086, "learning_rate": 2.4606020237374644e-05, "loss": 0.7962, "step": 1569 }, { "epoch": 0.4321200027523567, "grad_norm": 0.29678832958335566, "learning_rate": 2.458901545129709e-05, "loss": 0.7965, "step": 1570 }, { "epoch": 0.4323952384228996, "grad_norm": 0.3142548013817184, "learning_rate": 2.457200716270561e-05, "loss": 0.8115, "step": 1571 }, { "epoch": 0.4326704740934425, "grad_norm": 0.29426185891272927, "learning_rate": 2.455499538458158e-05, "loss": 0.7971, "step": 1572 }, { "epoch": 0.4329457097639854, "grad_norm": 0.3232925060411943, "learning_rate": 2.453798012990904e-05, "loss": 0.8027, "step": 1573 }, { "epoch": 0.4332209454345283, "grad_norm": 0.2851698569966, "learning_rate": 2.45209614116747e-05, "loss": 0.8112, "step": 1574 }, { "epoch": 0.4334961811050712, "grad_norm": 0.3710782702944274, "learning_rate": 2.4503939242867894e-05, "loss": 0.7781, "step": 1575 }, { "epoch": 0.4337714167756141, "grad_norm": 0.2958423052963948, "learning_rate": 2.4486913636480614e-05, "loss": 0.7993, "step": 1576 }, { "epoch": 0.43404665244615703, "grad_norm": 0.284930887135061, "learning_rate": 2.4469884605507446e-05, "loss": 0.8023, "step": 1577 }, { "epoch": 0.4343218881166999, "grad_norm": 0.314246053196774, "learning_rate": 2.445285216294561e-05, "loss": 0.768, "step": 1578 }, { "epoch": 0.4345971237872428, "grad_norm": 0.28396390598718796, "learning_rate": 2.443581632179493e-05, "loss": 0.7908, "step": 1579 }, { "epoch": 0.43487235945778574, "grad_norm": 0.3128320861472054, "learning_rate": 2.4418777095057803e-05, "loss": 0.7853, "step": 1580 }, { "epoch": 0.4351475951283286, "grad_norm": 0.30881869705845577, "learning_rate": 2.4401734495739243e-05, "loss": 0.8109, "step": 1581 }, { "epoch": 0.4354228307988715, "grad_norm": 0.30120876064722846, "learning_rate": 2.4384688536846813e-05, "loss": 0.805, "step": 1582 }, { "epoch": 0.43569806646941445, "grad_norm": 0.31066632616537543, "learning_rate": 2.4367639231390645e-05, "loss": 0.7703, "step": 1583 }, { "epoch": 0.43597330213995733, "grad_norm": 0.3004766739033846, "learning_rate": 2.4350586592383424e-05, "loss": 0.8056, "step": 1584 }, { "epoch": 0.4362485378105002, "grad_norm": 0.2833664052327661, "learning_rate": 2.433353063284039e-05, "loss": 0.7685, "step": 1585 }, { "epoch": 0.43652377348104315, "grad_norm": 0.2811209308675284, "learning_rate": 2.4316471365779317e-05, "loss": 0.8157, "step": 1586 }, { "epoch": 0.43679900915158604, "grad_norm": 0.288913620983614, "learning_rate": 2.4299408804220485e-05, "loss": 0.7907, "step": 1587 }, { "epoch": 0.4370742448221289, "grad_norm": 0.27966116229705296, "learning_rate": 2.4282342961186705e-05, "loss": 0.7655, "step": 1588 }, { "epoch": 0.43734948049267186, "grad_norm": 0.2641961924186609, "learning_rate": 2.426527384970329e-05, "loss": 0.7959, "step": 1589 }, { "epoch": 0.43762471616321474, "grad_norm": 0.27668049741714845, "learning_rate": 2.424820148279803e-05, "loss": 0.7867, "step": 1590 }, { "epoch": 0.43789995183375763, "grad_norm": 0.2611167188967044, "learning_rate": 2.423112587350124e-05, "loss": 0.7984, "step": 1591 }, { "epoch": 0.43817518750430057, "grad_norm": 0.33357550981600415, "learning_rate": 2.4214047034845673e-05, "loss": 0.8253, "step": 1592 }, { "epoch": 0.43845042317484345, "grad_norm": 0.26710635956132567, "learning_rate": 2.419696497986656e-05, "loss": 0.7881, "step": 1593 }, { "epoch": 0.43872565884538633, "grad_norm": 0.2736293414899826, "learning_rate": 2.417987972160158e-05, "loss": 0.7675, "step": 1594 }, { "epoch": 0.4390008945159293, "grad_norm": 0.2941948694624388, "learning_rate": 2.4162791273090863e-05, "loss": 0.7713, "step": 1595 }, { "epoch": 0.43927613018647216, "grad_norm": 0.27180507918902364, "learning_rate": 2.414569964737698e-05, "loss": 0.8087, "step": 1596 }, { "epoch": 0.43955136585701504, "grad_norm": 0.32201452043854006, "learning_rate": 2.4128604857504923e-05, "loss": 0.8115, "step": 1597 }, { "epoch": 0.439826601527558, "grad_norm": 0.27019563722592305, "learning_rate": 2.4111506916522084e-05, "loss": 0.7925, "step": 1598 }, { "epoch": 0.44010183719810086, "grad_norm": 0.28629218746674434, "learning_rate": 2.409440583747828e-05, "loss": 0.798, "step": 1599 }, { "epoch": 0.44037707286864375, "grad_norm": 0.28224945076498836, "learning_rate": 2.4077301633425716e-05, "loss": 0.7882, "step": 1600 }, { "epoch": 0.4406523085391867, "grad_norm": 0.2700656948439867, "learning_rate": 2.4060194317418974e-05, "loss": 0.859, "step": 1601 }, { "epoch": 0.44092754420972957, "grad_norm": 0.29623434887566996, "learning_rate": 2.404308390251503e-05, "loss": 0.8176, "step": 1602 }, { "epoch": 0.4412027798802725, "grad_norm": 0.25995691825993167, "learning_rate": 2.4025970401773204e-05, "loss": 0.7734, "step": 1603 }, { "epoch": 0.4414780155508154, "grad_norm": 0.28578242404804854, "learning_rate": 2.4008853828255187e-05, "loss": 0.8247, "step": 1604 }, { "epoch": 0.4417532512213583, "grad_norm": 0.3291469264128354, "learning_rate": 2.399173419502501e-05, "loss": 0.8069, "step": 1605 }, { "epoch": 0.4420284868919012, "grad_norm": 0.3093473781894673, "learning_rate": 2.3974611515149032e-05, "loss": 0.7878, "step": 1606 }, { "epoch": 0.4423037225624441, "grad_norm": 0.2857840619139393, "learning_rate": 2.395748580169595e-05, "loss": 0.7971, "step": 1607 }, { "epoch": 0.442578958232987, "grad_norm": 0.33309283781537863, "learning_rate": 2.394035706773677e-05, "loss": 0.8074, "step": 1608 }, { "epoch": 0.4428541939035299, "grad_norm": 0.33075153702648236, "learning_rate": 2.39232253263448e-05, "loss": 0.7754, "step": 1609 }, { "epoch": 0.4431294295740728, "grad_norm": 0.27203771265724375, "learning_rate": 2.390609059059565e-05, "loss": 0.782, "step": 1610 }, { "epoch": 0.4434046652446157, "grad_norm": 0.33236891628353504, "learning_rate": 2.3888952873567216e-05, "loss": 0.7739, "step": 1611 }, { "epoch": 0.44367990091515863, "grad_norm": 0.29014067567314206, "learning_rate": 2.3871812188339653e-05, "loss": 0.7897, "step": 1612 }, { "epoch": 0.4439551365857015, "grad_norm": 0.3094146471792101, "learning_rate": 2.385466854799541e-05, "loss": 0.7758, "step": 1613 }, { "epoch": 0.4442303722562444, "grad_norm": 0.2973785749542664, "learning_rate": 2.3837521965619167e-05, "loss": 0.7878, "step": 1614 }, { "epoch": 0.44450560792678734, "grad_norm": 0.31623424827633717, "learning_rate": 2.382037245429786e-05, "loss": 0.8003, "step": 1615 }, { "epoch": 0.4447808435973302, "grad_norm": 0.310821037517096, "learning_rate": 2.3803220027120654e-05, "loss": 0.7984, "step": 1616 }, { "epoch": 0.4450560792678731, "grad_norm": 0.2857163022467033, "learning_rate": 2.378606469717896e-05, "loss": 0.7953, "step": 1617 }, { "epoch": 0.44533131493841605, "grad_norm": 0.31477276396196974, "learning_rate": 2.376890647756637e-05, "loss": 0.7805, "step": 1618 }, { "epoch": 0.44560655060895893, "grad_norm": 0.3108309726428149, "learning_rate": 2.3751745381378714e-05, "loss": 0.7957, "step": 1619 }, { "epoch": 0.4458817862795018, "grad_norm": 0.28791878950978966, "learning_rate": 2.3734581421713987e-05, "loss": 0.7979, "step": 1620 }, { "epoch": 0.44615702195004475, "grad_norm": 0.31005767280539925, "learning_rate": 2.3717414611672408e-05, "loss": 0.7829, "step": 1621 }, { "epoch": 0.44643225762058764, "grad_norm": 0.28154708408818874, "learning_rate": 2.370024496435634e-05, "loss": 0.7942, "step": 1622 }, { "epoch": 0.4467074932911305, "grad_norm": 0.3027781268228018, "learning_rate": 2.368307249287031e-05, "loss": 0.8059, "step": 1623 }, { "epoch": 0.44698272896167346, "grad_norm": 0.28151340227579136, "learning_rate": 2.366589721032103e-05, "loss": 0.8184, "step": 1624 }, { "epoch": 0.44725796463221634, "grad_norm": 0.3363786663035669, "learning_rate": 2.3648719129817335e-05, "loss": 0.79, "step": 1625 }, { "epoch": 0.4475332003027592, "grad_norm": 0.2750818479805928, "learning_rate": 2.363153826447019e-05, "loss": 0.7688, "step": 1626 }, { "epoch": 0.44780843597330217, "grad_norm": 0.31079540101572517, "learning_rate": 2.3614354627392703e-05, "loss": 0.7948, "step": 1627 }, { "epoch": 0.44808367164384505, "grad_norm": 0.2736270642653545, "learning_rate": 2.359716823170009e-05, "loss": 0.7741, "step": 1628 }, { "epoch": 0.44835890731438793, "grad_norm": 0.2938174781088623, "learning_rate": 2.3579979090509672e-05, "loss": 0.7932, "step": 1629 }, { "epoch": 0.4486341429849309, "grad_norm": 0.3075005581220249, "learning_rate": 2.3562787216940864e-05, "loss": 0.8294, "step": 1630 }, { "epoch": 0.44890937865547376, "grad_norm": 0.26738711635634516, "learning_rate": 2.3545592624115172e-05, "loss": 0.7724, "step": 1631 }, { "epoch": 0.44918461432601664, "grad_norm": 0.3026137091561077, "learning_rate": 2.3528395325156175e-05, "loss": 0.7943, "step": 1632 }, { "epoch": 0.4494598499965596, "grad_norm": 0.3535514366251364, "learning_rate": 2.3511195333189503e-05, "loss": 0.802, "step": 1633 }, { "epoch": 0.44973508566710246, "grad_norm": 0.30117982206851973, "learning_rate": 2.3493992661342865e-05, "loss": 0.8023, "step": 1634 }, { "epoch": 0.45001032133764535, "grad_norm": 0.2694164912698681, "learning_rate": 2.3476787322746007e-05, "loss": 0.7828, "step": 1635 }, { "epoch": 0.4502855570081883, "grad_norm": 0.2945971699512249, "learning_rate": 2.345957933053071e-05, "loss": 0.7731, "step": 1636 }, { "epoch": 0.45056079267873117, "grad_norm": 0.6140352459748996, "learning_rate": 2.3442368697830767e-05, "loss": 0.8232, "step": 1637 }, { "epoch": 0.45083602834927405, "grad_norm": 0.32155502499418237, "learning_rate": 2.3425155437782007e-05, "loss": 0.7794, "step": 1638 }, { "epoch": 0.451111264019817, "grad_norm": 0.2701455300552998, "learning_rate": 2.3407939563522248e-05, "loss": 0.7939, "step": 1639 }, { "epoch": 0.4513864996903599, "grad_norm": 0.26950129133550305, "learning_rate": 2.3390721088191322e-05, "loss": 0.8323, "step": 1640 }, { "epoch": 0.45166173536090276, "grad_norm": 0.2914499396388273, "learning_rate": 2.3373500024931025e-05, "loss": 0.7892, "step": 1641 }, { "epoch": 0.4519369710314457, "grad_norm": 0.27967733941718875, "learning_rate": 2.3356276386885144e-05, "loss": 0.8191, "step": 1642 }, { "epoch": 0.4522122067019886, "grad_norm": 0.2900091020222259, "learning_rate": 2.3339050187199423e-05, "loss": 0.7908, "step": 1643 }, { "epoch": 0.45248744237253147, "grad_norm": 0.28773498093485295, "learning_rate": 2.3321821439021556e-05, "loss": 0.8074, "step": 1644 }, { "epoch": 0.4527626780430744, "grad_norm": 0.45887861211448044, "learning_rate": 2.3304590155501198e-05, "loss": 0.7767, "step": 1645 }, { "epoch": 0.4530379137136173, "grad_norm": 0.3183033245742924, "learning_rate": 2.3287356349789936e-05, "loss": 0.816, "step": 1646 }, { "epoch": 0.4533131493841602, "grad_norm": 0.3175071359168492, "learning_rate": 2.327012003504127e-05, "loss": 0.8024, "step": 1647 }, { "epoch": 0.4535883850547031, "grad_norm": 0.2838076219406021, "learning_rate": 2.3252881224410612e-05, "loss": 0.7874, "step": 1648 }, { "epoch": 0.453863620725246, "grad_norm": 0.3208661583070452, "learning_rate": 2.32356399310553e-05, "loss": 0.8151, "step": 1649 }, { "epoch": 0.4541388563957889, "grad_norm": 0.2927301112340574, "learning_rate": 2.321839616813455e-05, "loss": 0.8261, "step": 1650 }, { "epoch": 0.4544140920663318, "grad_norm": 0.3057884616049347, "learning_rate": 2.3201149948809473e-05, "loss": 0.8097, "step": 1651 }, { "epoch": 0.4546893277368747, "grad_norm": 0.29642780321189954, "learning_rate": 2.3183901286243047e-05, "loss": 0.8077, "step": 1652 }, { "epoch": 0.4549645634074176, "grad_norm": 0.3064307972670116, "learning_rate": 2.3166650193600123e-05, "loss": 0.8146, "step": 1653 }, { "epoch": 0.4552397990779605, "grad_norm": 0.298599499435739, "learning_rate": 2.3149396684047397e-05, "loss": 0.782, "step": 1654 }, { "epoch": 0.4555150347485034, "grad_norm": 0.2609796353777113, "learning_rate": 2.313214077075341e-05, "loss": 0.8092, "step": 1655 }, { "epoch": 0.4557902704190463, "grad_norm": 0.2982766885059548, "learning_rate": 2.311488246688854e-05, "loss": 0.7951, "step": 1656 }, { "epoch": 0.45606550608958923, "grad_norm": 0.2882480134195979, "learning_rate": 2.309762178562501e-05, "loss": 0.7873, "step": 1657 }, { "epoch": 0.4563407417601321, "grad_norm": 0.3153400577453351, "learning_rate": 2.3080358740136822e-05, "loss": 0.7921, "step": 1658 }, { "epoch": 0.456615977430675, "grad_norm": 0.27485275779932977, "learning_rate": 2.3063093343599806e-05, "loss": 0.8, "step": 1659 }, { "epoch": 0.45689121310121794, "grad_norm": 0.2958132200803276, "learning_rate": 2.3045825609191578e-05, "loss": 0.7663, "step": 1660 }, { "epoch": 0.4571664487717608, "grad_norm": 0.27874225136860875, "learning_rate": 2.3028555550091536e-05, "loss": 0.8159, "step": 1661 }, { "epoch": 0.4574416844423037, "grad_norm": 0.30278994866904807, "learning_rate": 2.3011283179480862e-05, "loss": 0.7959, "step": 1662 }, { "epoch": 0.45771692011284665, "grad_norm": 0.2592295382331562, "learning_rate": 2.2994008510542498e-05, "loss": 0.7713, "step": 1663 }, { "epoch": 0.45799215578338953, "grad_norm": 0.30398413168142274, "learning_rate": 2.2976731556461135e-05, "loss": 0.783, "step": 1664 }, { "epoch": 0.4582673914539324, "grad_norm": 0.27671341614776185, "learning_rate": 2.2959452330423217e-05, "loss": 0.8502, "step": 1665 }, { "epoch": 0.45854262712447535, "grad_norm": 0.31941357339594073, "learning_rate": 2.2942170845616905e-05, "loss": 0.8339, "step": 1666 }, { "epoch": 0.45881786279501824, "grad_norm": 1.0728719259556911, "learning_rate": 2.2924887115232113e-05, "loss": 0.8286, "step": 1667 }, { "epoch": 0.4590930984655611, "grad_norm": 0.32142749336199716, "learning_rate": 2.2907601152460442e-05, "loss": 0.7874, "step": 1668 }, { "epoch": 0.45936833413610406, "grad_norm": 0.3430812451270998, "learning_rate": 2.289031297049521e-05, "loss": 0.7907, "step": 1669 }, { "epoch": 0.45964356980664695, "grad_norm": 0.33332169085431984, "learning_rate": 2.2873022582531412e-05, "loss": 0.786, "step": 1670 }, { "epoch": 0.45991880547718983, "grad_norm": 0.3186064264933752, "learning_rate": 2.2855730001765763e-05, "loss": 0.8062, "step": 1671 }, { "epoch": 0.46019404114773277, "grad_norm": 0.31253625370356675, "learning_rate": 2.2838435241396618e-05, "loss": 0.7908, "step": 1672 }, { "epoch": 0.46046927681827565, "grad_norm": 0.2969753207919644, "learning_rate": 2.2821138314624e-05, "loss": 0.8185, "step": 1673 }, { "epoch": 0.46074451248881854, "grad_norm": 0.3022616202337864, "learning_rate": 2.2803839234649604e-05, "loss": 0.8005, "step": 1674 }, { "epoch": 0.4610197481593615, "grad_norm": 0.3258353635434477, "learning_rate": 2.278653801467675e-05, "loss": 0.786, "step": 1675 }, { "epoch": 0.46129498382990436, "grad_norm": 0.27754542177239094, "learning_rate": 2.2769234667910394e-05, "loss": 0.805, "step": 1676 }, { "epoch": 0.46157021950044724, "grad_norm": 0.30896749485382285, "learning_rate": 2.2751929207557124e-05, "loss": 0.7995, "step": 1677 }, { "epoch": 0.4618454551709902, "grad_norm": 0.277123554364044, "learning_rate": 2.2734621646825145e-05, "loss": 0.7906, "step": 1678 }, { "epoch": 0.46212069084153307, "grad_norm": 0.36632232653377717, "learning_rate": 2.2717311998924237e-05, "loss": 0.7961, "step": 1679 }, { "epoch": 0.46239592651207595, "grad_norm": 0.2791989728634918, "learning_rate": 2.2700000277065805e-05, "loss": 0.7912, "step": 1680 }, { "epoch": 0.4626711621826189, "grad_norm": 0.29547976952313004, "learning_rate": 2.2682686494462822e-05, "loss": 0.8073, "step": 1681 }, { "epoch": 0.4629463978531618, "grad_norm": 0.29194813535287817, "learning_rate": 2.2665370664329834e-05, "loss": 0.7869, "step": 1682 }, { "epoch": 0.46322163352370466, "grad_norm": 0.3007751469987453, "learning_rate": 2.2648052799882953e-05, "loss": 0.7873, "step": 1683 }, { "epoch": 0.4634968691942476, "grad_norm": 0.4010424059498456, "learning_rate": 2.2630732914339836e-05, "loss": 0.8353, "step": 1684 }, { "epoch": 0.4637721048647905, "grad_norm": 0.3145067506452559, "learning_rate": 2.2613411020919704e-05, "loss": 0.8108, "step": 1685 }, { "epoch": 0.46404734053533336, "grad_norm": 0.2933089618615493, "learning_rate": 2.2596087132843287e-05, "loss": 0.8128, "step": 1686 }, { "epoch": 0.4643225762058763, "grad_norm": 0.28491725094157117, "learning_rate": 2.257876126333284e-05, "loss": 0.7935, "step": 1687 }, { "epoch": 0.4645978118764192, "grad_norm": 0.29896200376966015, "learning_rate": 2.256143342561214e-05, "loss": 0.8101, "step": 1688 }, { "epoch": 0.46487304754696207, "grad_norm": 0.3168832733933036, "learning_rate": 2.2544103632906465e-05, "loss": 0.8099, "step": 1689 }, { "epoch": 0.465148283217505, "grad_norm": 0.36920663455628144, "learning_rate": 2.252677189844259e-05, "loss": 0.7669, "step": 1690 }, { "epoch": 0.4654235188880479, "grad_norm": 0.4136014450183235, "learning_rate": 2.2509438235448748e-05, "loss": 0.7976, "step": 1691 }, { "epoch": 0.4656987545585908, "grad_norm": 0.3330953429975218, "learning_rate": 2.249210265715467e-05, "loss": 0.7925, "step": 1692 }, { "epoch": 0.4659739902291337, "grad_norm": 0.28640205388627815, "learning_rate": 2.2474765176791532e-05, "loss": 0.8072, "step": 1693 }, { "epoch": 0.4662492258996766, "grad_norm": 0.28838476177714323, "learning_rate": 2.2457425807591988e-05, "loss": 0.7727, "step": 1694 }, { "epoch": 0.4665244615702195, "grad_norm": 0.2787411327807749, "learning_rate": 2.2440084562790085e-05, "loss": 0.8043, "step": 1695 }, { "epoch": 0.4667996972407624, "grad_norm": 0.28745042227663387, "learning_rate": 2.242274145562136e-05, "loss": 0.7948, "step": 1696 }, { "epoch": 0.4670749329113053, "grad_norm": 0.27861867576324845, "learning_rate": 2.2405396499322727e-05, "loss": 0.7987, "step": 1697 }, { "epoch": 0.4673501685818482, "grad_norm": 0.2670184013285605, "learning_rate": 2.2388049707132527e-05, "loss": 0.7943, "step": 1698 }, { "epoch": 0.46762540425239113, "grad_norm": 0.2930704355624698, "learning_rate": 2.2370701092290506e-05, "loss": 0.7938, "step": 1699 }, { "epoch": 0.467900639922934, "grad_norm": 0.2721421931599048, "learning_rate": 2.23533506680378e-05, "loss": 0.811, "step": 1700 }, { "epoch": 0.4681758755934769, "grad_norm": 0.2698915180238021, "learning_rate": 2.2335998447616918e-05, "loss": 0.7921, "step": 1701 }, { "epoch": 0.46845111126401984, "grad_norm": 0.3572038292718193, "learning_rate": 2.2318644444271746e-05, "loss": 0.7936, "step": 1702 }, { "epoch": 0.4687263469345627, "grad_norm": 0.2798386134482875, "learning_rate": 2.2301288671247532e-05, "loss": 0.8357, "step": 1703 }, { "epoch": 0.4690015826051056, "grad_norm": 0.2776607747426227, "learning_rate": 2.228393114179087e-05, "loss": 0.8117, "step": 1704 }, { "epoch": 0.46927681827564854, "grad_norm": 0.3206463207137854, "learning_rate": 2.2266571869149698e-05, "loss": 0.7891, "step": 1705 }, { "epoch": 0.46955205394619143, "grad_norm": 0.27957116412654204, "learning_rate": 2.2249210866573287e-05, "loss": 0.7742, "step": 1706 }, { "epoch": 0.4698272896167343, "grad_norm": 0.3344496998668106, "learning_rate": 2.2231848147312224e-05, "loss": 0.8049, "step": 1707 }, { "epoch": 0.47010252528727725, "grad_norm": 0.29992518479139924, "learning_rate": 2.2214483724618406e-05, "loss": 0.7837, "step": 1708 }, { "epoch": 0.47037776095782013, "grad_norm": 0.29207296727357596, "learning_rate": 2.2197117611745024e-05, "loss": 0.7987, "step": 1709 }, { "epoch": 0.470652996628363, "grad_norm": 0.3089860093733482, "learning_rate": 2.217974982194658e-05, "loss": 0.7949, "step": 1710 }, { "epoch": 0.47092823229890596, "grad_norm": 0.3000964759823666, "learning_rate": 2.2162380368478836e-05, "loss": 0.7441, "step": 1711 }, { "epoch": 0.47120346796944884, "grad_norm": 0.31947505972086276, "learning_rate": 2.214500926459883e-05, "loss": 0.819, "step": 1712 }, { "epoch": 0.4714787036399917, "grad_norm": 0.25563854247524825, "learning_rate": 2.212763652356486e-05, "loss": 0.7923, "step": 1713 }, { "epoch": 0.47175393931053466, "grad_norm": 0.3388312748649513, "learning_rate": 2.2110262158636474e-05, "loss": 0.7942, "step": 1714 }, { "epoch": 0.47202917498107755, "grad_norm": 0.2649085883782548, "learning_rate": 2.2092886183074464e-05, "loss": 0.7988, "step": 1715 }, { "epoch": 0.47230441065162043, "grad_norm": 0.29774010886809854, "learning_rate": 2.2075508610140828e-05, "loss": 0.7762, "step": 1716 }, { "epoch": 0.47257964632216337, "grad_norm": 0.2837838715013209, "learning_rate": 2.2058129453098826e-05, "loss": 0.806, "step": 1717 }, { "epoch": 0.47285488199270626, "grad_norm": 0.2728762648799962, "learning_rate": 2.204074872521288e-05, "loss": 0.8215, "step": 1718 }, { "epoch": 0.47313011766324914, "grad_norm": 0.2710413956835945, "learning_rate": 2.2023366439748647e-05, "loss": 0.8194, "step": 1719 }, { "epoch": 0.4734053533337921, "grad_norm": 0.5573222196343124, "learning_rate": 2.2005982609972952e-05, "loss": 0.786, "step": 1720 }, { "epoch": 0.47368058900433496, "grad_norm": 0.2731160307935018, "learning_rate": 2.1988597249153813e-05, "loss": 0.7878, "step": 1721 }, { "epoch": 0.47395582467487785, "grad_norm": 0.30399814931812263, "learning_rate": 2.1971210370560402e-05, "loss": 0.7796, "step": 1722 }, { "epoch": 0.4742310603454208, "grad_norm": 0.2608241851456652, "learning_rate": 2.1953821987463062e-05, "loss": 0.7937, "step": 1723 }, { "epoch": 0.47450629601596367, "grad_norm": 0.2931588669157855, "learning_rate": 2.193643211313327e-05, "loss": 0.7971, "step": 1724 }, { "epoch": 0.47478153168650655, "grad_norm": 0.26117651761114935, "learning_rate": 2.1919040760843663e-05, "loss": 0.7802, "step": 1725 }, { "epoch": 0.4750567673570495, "grad_norm": 0.2813837478909038, "learning_rate": 2.1901647943867986e-05, "loss": 0.7991, "step": 1726 }, { "epoch": 0.4753320030275924, "grad_norm": 0.27209640929023815, "learning_rate": 2.188425367548111e-05, "loss": 0.8, "step": 1727 }, { "epoch": 0.47560723869813526, "grad_norm": 0.2664957237643973, "learning_rate": 2.186685796895901e-05, "loss": 0.8048, "step": 1728 }, { "epoch": 0.4758824743686782, "grad_norm": 0.2765271449471321, "learning_rate": 2.1849460837578767e-05, "loss": 0.7783, "step": 1729 }, { "epoch": 0.4761577100392211, "grad_norm": 0.26359419929274464, "learning_rate": 2.183206229461854e-05, "loss": 0.7907, "step": 1730 }, { "epoch": 0.47643294570976397, "grad_norm": 0.2728067104523246, "learning_rate": 2.1814662353357567e-05, "loss": 0.7896, "step": 1731 }, { "epoch": 0.4767081813803069, "grad_norm": 0.2770972625384237, "learning_rate": 2.1797261027076166e-05, "loss": 0.7618, "step": 1732 }, { "epoch": 0.4769834170508498, "grad_norm": 0.27716393138190776, "learning_rate": 2.1779858329055688e-05, "loss": 0.8056, "step": 1733 }, { "epoch": 0.4772586527213927, "grad_norm": 0.7210957908286135, "learning_rate": 2.176245427257855e-05, "loss": 0.837, "step": 1734 }, { "epoch": 0.4775338883919356, "grad_norm": 0.2722418496214004, "learning_rate": 2.1745048870928208e-05, "loss": 0.7975, "step": 1735 }, { "epoch": 0.4778091240624785, "grad_norm": 0.2627307363683731, "learning_rate": 2.1727642137389124e-05, "loss": 0.7886, "step": 1736 }, { "epoch": 0.4780843597330214, "grad_norm": 0.28372534817140965, "learning_rate": 2.17102340852468e-05, "loss": 0.759, "step": 1737 }, { "epoch": 0.4783595954035643, "grad_norm": 0.26512671637247087, "learning_rate": 2.1692824727787736e-05, "loss": 0.771, "step": 1738 }, { "epoch": 0.4786348310741072, "grad_norm": 0.28252988096499726, "learning_rate": 2.1675414078299418e-05, "loss": 0.8153, "step": 1739 }, { "epoch": 0.4789100667446501, "grad_norm": 0.28314757859677153, "learning_rate": 2.1658002150070332e-05, "loss": 0.7748, "step": 1740 }, { "epoch": 0.479185302415193, "grad_norm": 0.27183430887823945, "learning_rate": 2.1640588956389923e-05, "loss": 0.7949, "step": 1741 }, { "epoch": 0.4794605380857359, "grad_norm": 0.3077901926897006, "learning_rate": 2.1623174510548627e-05, "loss": 0.7766, "step": 1742 }, { "epoch": 0.4797357737562788, "grad_norm": 0.27760037753894373, "learning_rate": 2.160575882583782e-05, "loss": 0.8078, "step": 1743 }, { "epoch": 0.48001100942682173, "grad_norm": 0.293236444387804, "learning_rate": 2.1588341915549825e-05, "loss": 0.7932, "step": 1744 }, { "epoch": 0.4802862450973646, "grad_norm": 0.30811500300258376, "learning_rate": 2.1570923792977893e-05, "loss": 0.8057, "step": 1745 }, { "epoch": 0.4805614807679075, "grad_norm": 0.2783070230175767, "learning_rate": 2.155350447141622e-05, "loss": 0.8013, "step": 1746 }, { "epoch": 0.48083671643845044, "grad_norm": 0.2572646507800091, "learning_rate": 2.1536083964159893e-05, "loss": 0.789, "step": 1747 }, { "epoch": 0.4811119521089933, "grad_norm": 0.28290675903026463, "learning_rate": 2.1518662284504927e-05, "loss": 0.798, "step": 1748 }, { "epoch": 0.4813871877795362, "grad_norm": 0.2758544840675627, "learning_rate": 2.150123944574822e-05, "loss": 0.7961, "step": 1749 }, { "epoch": 0.48166242345007915, "grad_norm": 0.628865638924377, "learning_rate": 2.1483815461187553e-05, "loss": 0.7901, "step": 1750 }, { "epoch": 0.48193765912062203, "grad_norm": 0.2707563624141069, "learning_rate": 2.1466390344121583e-05, "loss": 0.8124, "step": 1751 }, { "epoch": 0.4822128947911649, "grad_norm": 0.2831957978634998, "learning_rate": 2.1448964107849828e-05, "loss": 0.7904, "step": 1752 }, { "epoch": 0.48248813046170785, "grad_norm": 0.29371461458299014, "learning_rate": 2.1431536765672676e-05, "loss": 0.7907, "step": 1753 }, { "epoch": 0.48276336613225074, "grad_norm": 0.2581621035177647, "learning_rate": 2.1414108330891348e-05, "loss": 0.7765, "step": 1754 }, { "epoch": 0.4830386018027936, "grad_norm": 0.2814056634036065, "learning_rate": 2.139667881680789e-05, "loss": 0.8158, "step": 1755 }, { "epoch": 0.48331383747333656, "grad_norm": 0.2758666530494281, "learning_rate": 2.137924823672518e-05, "loss": 0.7859, "step": 1756 }, { "epoch": 0.48358907314387944, "grad_norm": 0.39000091763762096, "learning_rate": 2.1361816603946922e-05, "loss": 0.7759, "step": 1757 }, { "epoch": 0.48386430881442233, "grad_norm": 0.29037363845582215, "learning_rate": 2.1344383931777606e-05, "loss": 0.792, "step": 1758 }, { "epoch": 0.48413954448496527, "grad_norm": 0.38418032710709565, "learning_rate": 2.1326950233522515e-05, "loss": 0.7993, "step": 1759 }, { "epoch": 0.48441478015550815, "grad_norm": 0.29204665923332523, "learning_rate": 2.130951552248773e-05, "loss": 0.7665, "step": 1760 }, { "epoch": 0.48469001582605103, "grad_norm": 0.291882163067355, "learning_rate": 2.1292079811980093e-05, "loss": 0.7819, "step": 1761 }, { "epoch": 0.484965251496594, "grad_norm": 0.28631367096112953, "learning_rate": 2.1274643115307207e-05, "loss": 0.7981, "step": 1762 }, { "epoch": 0.48524048716713686, "grad_norm": 0.28768312205681207, "learning_rate": 2.125720544577744e-05, "loss": 0.798, "step": 1763 }, { "epoch": 0.48551572283767974, "grad_norm": 0.34242076178983794, "learning_rate": 2.1239766816699894e-05, "loss": 0.7956, "step": 1764 }, { "epoch": 0.4857909585082227, "grad_norm": 0.2854851432802041, "learning_rate": 2.12223272413844e-05, "loss": 0.8174, "step": 1765 }, { "epoch": 0.48606619417876556, "grad_norm": 0.26540351697584436, "learning_rate": 2.120488673314152e-05, "loss": 0.7867, "step": 1766 }, { "epoch": 0.48634142984930845, "grad_norm": 0.2907226629348622, "learning_rate": 2.1187445305282525e-05, "loss": 0.8248, "step": 1767 }, { "epoch": 0.4866166655198514, "grad_norm": 0.2698162490585244, "learning_rate": 2.117000297111938e-05, "loss": 0.8054, "step": 1768 }, { "epoch": 0.48689190119039427, "grad_norm": 0.269232138249288, "learning_rate": 2.115255974396476e-05, "loss": 0.7755, "step": 1769 }, { "epoch": 0.48716713686093716, "grad_norm": 0.2807591574601917, "learning_rate": 2.1135115637131994e-05, "loss": 0.7997, "step": 1770 }, { "epoch": 0.4874423725314801, "grad_norm": 0.2770987432672441, "learning_rate": 2.1117670663935118e-05, "loss": 0.778, "step": 1771 }, { "epoch": 0.487717608202023, "grad_norm": 0.2621201805827772, "learning_rate": 2.1100224837688792e-05, "loss": 0.7624, "step": 1772 }, { "epoch": 0.48799284387256586, "grad_norm": 0.29584262114495097, "learning_rate": 2.1082778171708355e-05, "loss": 0.7917, "step": 1773 }, { "epoch": 0.4882680795431088, "grad_norm": 0.28810584622906893, "learning_rate": 2.1065330679309766e-05, "loss": 0.8017, "step": 1774 }, { "epoch": 0.4885433152136517, "grad_norm": 0.3037144161798492, "learning_rate": 2.1047882373809646e-05, "loss": 0.7912, "step": 1775 }, { "epoch": 0.48881855088419457, "grad_norm": 0.3487331345848116, "learning_rate": 2.10304332685252e-05, "loss": 0.7938, "step": 1776 }, { "epoch": 0.4890937865547375, "grad_norm": 0.29140261836006287, "learning_rate": 2.1012983376774255e-05, "loss": 0.7831, "step": 1777 }, { "epoch": 0.4893690222252804, "grad_norm": 0.31938290483864246, "learning_rate": 2.099553271187526e-05, "loss": 0.7517, "step": 1778 }, { "epoch": 0.4896442578958233, "grad_norm": 0.30007021053547217, "learning_rate": 2.0978081287147218e-05, "loss": 0.7896, "step": 1779 }, { "epoch": 0.4899194935663662, "grad_norm": 0.2546187942290934, "learning_rate": 2.0960629115909743e-05, "loss": 0.7926, "step": 1780 }, { "epoch": 0.4901947292369091, "grad_norm": 0.30089950412051, "learning_rate": 2.0943176211483013e-05, "loss": 0.7838, "step": 1781 }, { "epoch": 0.490469964907452, "grad_norm": 0.30372815830362443, "learning_rate": 2.092572258718774e-05, "loss": 0.7852, "step": 1782 }, { "epoch": 0.4907452005779949, "grad_norm": 0.2836246346667227, "learning_rate": 2.090826825634522e-05, "loss": 0.7827, "step": 1783 }, { "epoch": 0.4910204362485378, "grad_norm": 0.28047859446672074, "learning_rate": 2.0890813232277263e-05, "loss": 0.7895, "step": 1784 }, { "epoch": 0.4912956719190807, "grad_norm": 0.28040166068412964, "learning_rate": 2.087335752830622e-05, "loss": 0.7763, "step": 1785 }, { "epoch": 0.49157090758962363, "grad_norm": 0.4580865112030622, "learning_rate": 2.0855901157754964e-05, "loss": 0.8046, "step": 1786 }, { "epoch": 0.4918461432601665, "grad_norm": 0.3264974327831298, "learning_rate": 2.0838444133946867e-05, "loss": 0.8223, "step": 1787 }, { "epoch": 0.4921213789307094, "grad_norm": 0.2669283921793564, "learning_rate": 2.0820986470205805e-05, "loss": 0.7801, "step": 1788 }, { "epoch": 0.49239661460125234, "grad_norm": 0.47090998360415265, "learning_rate": 2.0803528179856145e-05, "loss": 0.8139, "step": 1789 }, { "epoch": 0.4926718502717952, "grad_norm": 0.29972794899024474, "learning_rate": 2.0786069276222722e-05, "loss": 0.8035, "step": 1790 }, { "epoch": 0.4929470859423381, "grad_norm": 0.2972130902539023, "learning_rate": 2.076860977263085e-05, "loss": 0.7858, "step": 1791 }, { "epoch": 0.49322232161288104, "grad_norm": 0.28561626421570363, "learning_rate": 2.0751149682406303e-05, "loss": 0.7854, "step": 1792 }, { "epoch": 0.4934975572834239, "grad_norm": 0.2955980744524161, "learning_rate": 2.073368901887529e-05, "loss": 0.7527, "step": 1793 }, { "epoch": 0.4937727929539668, "grad_norm": 0.4196436861149892, "learning_rate": 2.071622779536446e-05, "loss": 0.8101, "step": 1794 }, { "epoch": 0.49404802862450975, "grad_norm": 0.2970753345608763, "learning_rate": 2.0698766025200897e-05, "loss": 0.8199, "step": 1795 }, { "epoch": 0.49432326429505263, "grad_norm": 0.27878974946916274, "learning_rate": 2.0681303721712105e-05, "loss": 0.8113, "step": 1796 }, { "epoch": 0.4945984999655955, "grad_norm": 0.32790368278803866, "learning_rate": 2.0663840898225982e-05, "loss": 0.7836, "step": 1797 }, { "epoch": 0.49487373563613846, "grad_norm": 0.2867737693561296, "learning_rate": 2.064637756807083e-05, "loss": 0.8134, "step": 1798 }, { "epoch": 0.49514897130668134, "grad_norm": 0.32467707157300846, "learning_rate": 2.0628913744575344e-05, "loss": 0.7824, "step": 1799 }, { "epoch": 0.4954242069772242, "grad_norm": 0.29139845858453167, "learning_rate": 2.061144944106858e-05, "loss": 0.8198, "step": 1800 }, { "epoch": 0.49569944264776716, "grad_norm": 0.3761268661384045, "learning_rate": 2.0593984670879973e-05, "loss": 0.7907, "step": 1801 }, { "epoch": 0.49597467831831005, "grad_norm": 0.2752478707286451, "learning_rate": 2.0576519447339313e-05, "loss": 0.8013, "step": 1802 }, { "epoch": 0.49624991398885293, "grad_norm": 0.30508824765776554, "learning_rate": 2.055905378377673e-05, "loss": 0.8013, "step": 1803 }, { "epoch": 0.49652514965939587, "grad_norm": 0.24694277564438605, "learning_rate": 2.0541587693522694e-05, "loss": 0.7752, "step": 1804 }, { "epoch": 0.49680038532993875, "grad_norm": 0.35293258669054917, "learning_rate": 2.0524121189908e-05, "loss": 0.7877, "step": 1805 }, { "epoch": 0.49707562100048164, "grad_norm": 0.2615144415699339, "learning_rate": 2.050665428626376e-05, "loss": 0.7906, "step": 1806 }, { "epoch": 0.4973508566710246, "grad_norm": 0.26741314910235753, "learning_rate": 2.0489186995921392e-05, "loss": 0.7659, "step": 1807 }, { "epoch": 0.49762609234156746, "grad_norm": 0.27073768541859894, "learning_rate": 2.0471719332212605e-05, "loss": 0.8053, "step": 1808 }, { "epoch": 0.49790132801211034, "grad_norm": 0.25624827625159563, "learning_rate": 2.045425130846939e-05, "loss": 0.7721, "step": 1809 }, { "epoch": 0.4981765636826533, "grad_norm": 0.27467751612423486, "learning_rate": 2.0436782938024023e-05, "loss": 0.7971, "step": 1810 }, { "epoch": 0.49845179935319617, "grad_norm": 0.2540227526578231, "learning_rate": 2.041931423420904e-05, "loss": 0.7702, "step": 1811 }, { "epoch": 0.49872703502373905, "grad_norm": 0.2537609456445603, "learning_rate": 2.0401845210357222e-05, "loss": 0.8158, "step": 1812 }, { "epoch": 0.499002270694282, "grad_norm": 0.2553053297171385, "learning_rate": 2.0384375879801622e-05, "loss": 0.7945, "step": 1813 }, { "epoch": 0.4992775063648249, "grad_norm": 0.23718379217472482, "learning_rate": 2.036690625587549e-05, "loss": 0.7967, "step": 1814 }, { "epoch": 0.49955274203536776, "grad_norm": 0.26219879330390655, "learning_rate": 2.0349436351912327e-05, "loss": 0.8149, "step": 1815 }, { "epoch": 0.4998279777059107, "grad_norm": 0.26072196556066396, "learning_rate": 2.0331966181245835e-05, "loss": 0.7824, "step": 1816 }, { "epoch": 0.5001032133764536, "grad_norm": 0.236613647680266, "learning_rate": 2.031449575720992e-05, "loss": 0.7812, "step": 1817 }, { "epoch": 0.5003784490469965, "grad_norm": 0.27423852596815435, "learning_rate": 2.0297025093138697e-05, "loss": 0.7727, "step": 1818 }, { "epoch": 0.5006536847175393, "grad_norm": 0.5283423097781088, "learning_rate": 2.0279554202366443e-05, "loss": 0.7747, "step": 1819 }, { "epoch": 0.5009289203880823, "grad_norm": 0.2660421101896664, "learning_rate": 2.026208309822762e-05, "loss": 0.7889, "step": 1820 }, { "epoch": 0.5012041560586252, "grad_norm": 0.25963909382849104, "learning_rate": 2.0244611794056846e-05, "loss": 0.794, "step": 1821 }, { "epoch": 0.5014793917291681, "grad_norm": 0.2871750017668787, "learning_rate": 2.0227140303188895e-05, "loss": 0.789, "step": 1822 }, { "epoch": 0.501754627399711, "grad_norm": 0.27434033789371726, "learning_rate": 2.0209668638958687e-05, "loss": 0.7897, "step": 1823 }, { "epoch": 0.5020298630702539, "grad_norm": 0.2794859206744288, "learning_rate": 2.0192196814701278e-05, "loss": 0.8211, "step": 1824 }, { "epoch": 0.5023050987407968, "grad_norm": 0.27023359462501895, "learning_rate": 2.0174724843751824e-05, "loss": 0.7968, "step": 1825 }, { "epoch": 0.5025803344113398, "grad_norm": 0.3088651290159606, "learning_rate": 2.0157252739445624e-05, "loss": 0.7835, "step": 1826 }, { "epoch": 0.5028555700818826, "grad_norm": 0.2523274812488868, "learning_rate": 2.0139780515118054e-05, "loss": 0.7642, "step": 1827 }, { "epoch": 0.5031308057524255, "grad_norm": 0.2901158820326341, "learning_rate": 2.0122308184104587e-05, "loss": 0.7728, "step": 1828 }, { "epoch": 0.5034060414229684, "grad_norm": 0.2656362348103561, "learning_rate": 2.0104835759740798e-05, "loss": 0.8049, "step": 1829 }, { "epoch": 0.5036812770935113, "grad_norm": 0.3040262021086047, "learning_rate": 2.00873632553623e-05, "loss": 0.7752, "step": 1830 }, { "epoch": 0.5039565127640542, "grad_norm": 0.33692564783429974, "learning_rate": 2.006989068430479e-05, "loss": 0.782, "step": 1831 }, { "epoch": 0.5042317484345972, "grad_norm": 0.2838371097622475, "learning_rate": 2.005241805990401e-05, "loss": 0.783, "step": 1832 }, { "epoch": 0.50450698410514, "grad_norm": 0.28443192939303713, "learning_rate": 2.003494539549574e-05, "loss": 0.8035, "step": 1833 }, { "epoch": 0.5047822197756829, "grad_norm": 0.2793398356762985, "learning_rate": 2.001747270441579e-05, "loss": 0.7697, "step": 1834 }, { "epoch": 0.5050574554462258, "grad_norm": 0.27926091910752626, "learning_rate": 2e-05, "loss": 0.7907, "step": 1835 }, { "epoch": 0.5053326911167687, "grad_norm": 0.2899739453078647, "learning_rate": 1.9982527295584217e-05, "loss": 0.7845, "step": 1836 }, { "epoch": 0.5056079267873116, "grad_norm": 0.2760882542671676, "learning_rate": 1.996505460450427e-05, "loss": 0.7749, "step": 1837 }, { "epoch": 0.5058831624578546, "grad_norm": 0.2930290348349952, "learning_rate": 1.9947581940096e-05, "loss": 0.7759, "step": 1838 }, { "epoch": 0.5061583981283975, "grad_norm": 0.29413520625087847, "learning_rate": 1.9930109315695212e-05, "loss": 0.8076, "step": 1839 }, { "epoch": 0.5064336337989404, "grad_norm": 0.2965867782023049, "learning_rate": 1.9912636744637704e-05, "loss": 0.8134, "step": 1840 }, { "epoch": 0.5067088694694832, "grad_norm": 0.2726351152200352, "learning_rate": 1.989516424025921e-05, "loss": 0.7884, "step": 1841 }, { "epoch": 0.5069841051400261, "grad_norm": 0.5284990385916277, "learning_rate": 1.9877691815895416e-05, "loss": 0.7711, "step": 1842 }, { "epoch": 0.507259340810569, "grad_norm": 0.31078040704691867, "learning_rate": 1.9860219484881953e-05, "loss": 0.8002, "step": 1843 }, { "epoch": 0.507534576481112, "grad_norm": 0.274453626099893, "learning_rate": 1.9842747260554383e-05, "loss": 0.7682, "step": 1844 }, { "epoch": 0.5078098121516549, "grad_norm": 0.5039990309141663, "learning_rate": 1.9825275156248183e-05, "loss": 0.8001, "step": 1845 }, { "epoch": 0.5080850478221978, "grad_norm": 0.26663518393366115, "learning_rate": 1.9807803185298725e-05, "loss": 0.8125, "step": 1846 }, { "epoch": 0.5083602834927406, "grad_norm": 0.3302154261670141, "learning_rate": 1.9790331361041316e-05, "loss": 0.8097, "step": 1847 }, { "epoch": 0.5086355191632835, "grad_norm": 0.2820575014419362, "learning_rate": 1.977285969681111e-05, "loss": 0.791, "step": 1848 }, { "epoch": 0.5089107548338264, "grad_norm": 0.30828900340014714, "learning_rate": 1.975538820594316e-05, "loss": 0.8212, "step": 1849 }, { "epoch": 0.5091859905043694, "grad_norm": 0.27770905907922044, "learning_rate": 1.9737916901772387e-05, "loss": 0.7995, "step": 1850 }, { "epoch": 0.5094612261749123, "grad_norm": 0.4189477872834542, "learning_rate": 1.9720445797633564e-05, "loss": 0.7752, "step": 1851 }, { "epoch": 0.5097364618454552, "grad_norm": 0.27017071599393705, "learning_rate": 1.9702974906861313e-05, "loss": 0.8072, "step": 1852 }, { "epoch": 0.5100116975159981, "grad_norm": 0.32253948520203274, "learning_rate": 1.968550424279008e-05, "loss": 0.7607, "step": 1853 }, { "epoch": 0.510286933186541, "grad_norm": 0.2849398772803456, "learning_rate": 1.9668033818754172e-05, "loss": 0.7822, "step": 1854 }, { "epoch": 0.5105621688570838, "grad_norm": 0.30576670900428615, "learning_rate": 1.9650563648087676e-05, "loss": 0.776, "step": 1855 }, { "epoch": 0.5108374045276268, "grad_norm": 0.3059638528133474, "learning_rate": 1.9633093744124513e-05, "loss": 0.7778, "step": 1856 }, { "epoch": 0.5111126401981697, "grad_norm": 0.2853091596695262, "learning_rate": 1.9615624120198385e-05, "loss": 0.7879, "step": 1857 }, { "epoch": 0.5113878758687126, "grad_norm": 0.279440207179744, "learning_rate": 1.959815478964278e-05, "loss": 0.7934, "step": 1858 }, { "epoch": 0.5116631115392555, "grad_norm": 0.26715188895634223, "learning_rate": 1.9580685765790967e-05, "loss": 0.7663, "step": 1859 }, { "epoch": 0.5119383472097984, "grad_norm": 0.26912141118388283, "learning_rate": 1.956321706197598e-05, "loss": 0.7929, "step": 1860 }, { "epoch": 0.5122135828803412, "grad_norm": 0.25812474718831835, "learning_rate": 1.9545748691530613e-05, "loss": 0.7892, "step": 1861 }, { "epoch": 0.5124888185508842, "grad_norm": 0.2782469711985159, "learning_rate": 1.9528280667787402e-05, "loss": 0.8091, "step": 1862 }, { "epoch": 0.5127640542214271, "grad_norm": 0.2855279171052471, "learning_rate": 1.9510813004078615e-05, "loss": 0.8117, "step": 1863 }, { "epoch": 0.51303928989197, "grad_norm": 0.28253600322665207, "learning_rate": 1.9493345713736248e-05, "loss": 0.8074, "step": 1864 }, { "epoch": 0.5133145255625129, "grad_norm": 0.28782847388424193, "learning_rate": 1.9475878810092005e-05, "loss": 0.7919, "step": 1865 }, { "epoch": 0.5135897612330558, "grad_norm": 0.27136792881072175, "learning_rate": 1.9458412306477316e-05, "loss": 0.8043, "step": 1866 }, { "epoch": 0.5138649969035987, "grad_norm": 0.29449075942078307, "learning_rate": 1.944094621622328e-05, "loss": 0.76, "step": 1867 }, { "epoch": 0.5141402325741417, "grad_norm": 0.25669349944292563, "learning_rate": 1.942348055266069e-05, "loss": 0.7584, "step": 1868 }, { "epoch": 0.5144154682446845, "grad_norm": 0.26624978552777906, "learning_rate": 1.940601532912003e-05, "loss": 0.7965, "step": 1869 }, { "epoch": 0.5146907039152274, "grad_norm": 0.26487169146946327, "learning_rate": 1.938855055893143e-05, "loss": 0.7862, "step": 1870 }, { "epoch": 0.5149659395857703, "grad_norm": 0.2638987307765772, "learning_rate": 1.9371086255424662e-05, "loss": 0.786, "step": 1871 }, { "epoch": 0.5152411752563132, "grad_norm": 0.25559559372955387, "learning_rate": 1.9353622431929175e-05, "loss": 0.7935, "step": 1872 }, { "epoch": 0.5155164109268561, "grad_norm": 0.26630601315009644, "learning_rate": 1.9336159101774025e-05, "loss": 0.7826, "step": 1873 }, { "epoch": 0.5157916465973991, "grad_norm": 0.2660509295352382, "learning_rate": 1.9318696278287905e-05, "loss": 0.7878, "step": 1874 }, { "epoch": 0.516066882267942, "grad_norm": 0.2615994462412795, "learning_rate": 1.9301233974799107e-05, "loss": 0.7931, "step": 1875 }, { "epoch": 0.5163421179384848, "grad_norm": 0.2729844686108098, "learning_rate": 1.9283772204635544e-05, "loss": 0.8023, "step": 1876 }, { "epoch": 0.5166173536090277, "grad_norm": 0.31472095061773553, "learning_rate": 1.9266310981124717e-05, "loss": 0.8158, "step": 1877 }, { "epoch": 0.5168925892795706, "grad_norm": 0.2829747043779742, "learning_rate": 1.92488503175937e-05, "loss": 0.7757, "step": 1878 }, { "epoch": 0.5171678249501135, "grad_norm": 0.266646264944014, "learning_rate": 1.9231390227369152e-05, "loss": 0.8025, "step": 1879 }, { "epoch": 0.5174430606206565, "grad_norm": 0.25708171952330294, "learning_rate": 1.9213930723777285e-05, "loss": 0.7672, "step": 1880 }, { "epoch": 0.5177182962911994, "grad_norm": 0.2856031088074033, "learning_rate": 1.919647182014386e-05, "loss": 0.7851, "step": 1881 }, { "epoch": 0.5179935319617422, "grad_norm": 0.250364937205058, "learning_rate": 1.9179013529794195e-05, "loss": 0.8055, "step": 1882 }, { "epoch": 0.5182687676322851, "grad_norm": 0.26899840968706573, "learning_rate": 1.9161555866053136e-05, "loss": 0.755, "step": 1883 }, { "epoch": 0.518544003302828, "grad_norm": 0.25350280903092137, "learning_rate": 1.9144098842245042e-05, "loss": 0.7899, "step": 1884 }, { "epoch": 0.5188192389733709, "grad_norm": 0.27039801347560255, "learning_rate": 1.912664247169379e-05, "loss": 0.7617, "step": 1885 }, { "epoch": 0.5190944746439139, "grad_norm": 0.26826753895162614, "learning_rate": 1.9109186767722743e-05, "loss": 0.7804, "step": 1886 }, { "epoch": 0.5193697103144568, "grad_norm": 0.25225340441463456, "learning_rate": 1.9091731743654792e-05, "loss": 0.7799, "step": 1887 }, { "epoch": 0.5196449459849997, "grad_norm": 0.2712241046085995, "learning_rate": 1.907427741281227e-05, "loss": 0.7956, "step": 1888 }, { "epoch": 0.5199201816555425, "grad_norm": 0.261010355273269, "learning_rate": 1.905682378851699e-05, "loss": 0.7806, "step": 1889 }, { "epoch": 0.5201954173260854, "grad_norm": 0.27913054691319983, "learning_rate": 1.9039370884090256e-05, "loss": 0.7827, "step": 1890 }, { "epoch": 0.5204706529966283, "grad_norm": 0.26569515334185306, "learning_rate": 1.9021918712852785e-05, "loss": 0.7793, "step": 1891 }, { "epoch": 0.5207458886671713, "grad_norm": 0.25150170325041604, "learning_rate": 1.9004467288124746e-05, "loss": 0.7626, "step": 1892 }, { "epoch": 0.5210211243377142, "grad_norm": 0.2660344293365876, "learning_rate": 1.8987016623225748e-05, "loss": 0.7686, "step": 1893 }, { "epoch": 0.5212963600082571, "grad_norm": 0.2713633212540108, "learning_rate": 1.896956673147481e-05, "loss": 0.7753, "step": 1894 }, { "epoch": 0.5215715956788, "grad_norm": 0.260961251206183, "learning_rate": 1.8952117626190364e-05, "loss": 0.7677, "step": 1895 }, { "epoch": 0.5218468313493428, "grad_norm": 0.27477723765459183, "learning_rate": 1.893466932069023e-05, "loss": 0.7499, "step": 1896 }, { "epoch": 0.5221220670198857, "grad_norm": 0.25867130600828864, "learning_rate": 1.8917221828291652e-05, "loss": 0.8165, "step": 1897 }, { "epoch": 0.5223973026904287, "grad_norm": 0.28667815675574226, "learning_rate": 1.889977516231121e-05, "loss": 0.805, "step": 1898 }, { "epoch": 0.5226725383609716, "grad_norm": 0.26150363141638605, "learning_rate": 1.8882329336064892e-05, "loss": 0.8143, "step": 1899 }, { "epoch": 0.5229477740315145, "grad_norm": 0.2804131727213887, "learning_rate": 1.886488436286801e-05, "loss": 0.8133, "step": 1900 }, { "epoch": 0.5232230097020574, "grad_norm": 0.25048597469911027, "learning_rate": 1.8847440256035252e-05, "loss": 0.7654, "step": 1901 }, { "epoch": 0.5234982453726003, "grad_norm": 0.26999491057017366, "learning_rate": 1.8829997028880625e-05, "loss": 0.8118, "step": 1902 }, { "epoch": 0.5237734810431431, "grad_norm": 0.2775730535331951, "learning_rate": 1.881255469471748e-05, "loss": 0.7955, "step": 1903 }, { "epoch": 0.5240487167136861, "grad_norm": 0.2680115716391252, "learning_rate": 1.8795113266858483e-05, "loss": 0.7818, "step": 1904 }, { "epoch": 0.524323952384229, "grad_norm": 0.2752545455895527, "learning_rate": 1.8777672758615604e-05, "loss": 0.7856, "step": 1905 }, { "epoch": 0.5245991880547719, "grad_norm": 0.27231929454550835, "learning_rate": 1.8760233183300112e-05, "loss": 0.8003, "step": 1906 }, { "epoch": 0.5248744237253148, "grad_norm": 0.2798918244464111, "learning_rate": 1.8742794554222568e-05, "loss": 0.811, "step": 1907 }, { "epoch": 0.5251496593958577, "grad_norm": 0.286642385052349, "learning_rate": 1.87253568846928e-05, "loss": 0.7648, "step": 1908 }, { "epoch": 0.5254248950664006, "grad_norm": 0.2684095848293027, "learning_rate": 1.8707920188019917e-05, "loss": 0.7969, "step": 1909 }, { "epoch": 0.5257001307369435, "grad_norm": 0.2719302405508206, "learning_rate": 1.8690484477512272e-05, "loss": 0.7954, "step": 1910 }, { "epoch": 0.5259753664074864, "grad_norm": 0.2598519706473702, "learning_rate": 1.8673049766477488e-05, "loss": 0.8129, "step": 1911 }, { "epoch": 0.5262506020780293, "grad_norm": 0.2758876019629264, "learning_rate": 1.86556160682224e-05, "loss": 0.7725, "step": 1912 }, { "epoch": 0.5265258377485722, "grad_norm": 0.31537757282624546, "learning_rate": 1.863818339605308e-05, "loss": 0.7699, "step": 1913 }, { "epoch": 0.5268010734191151, "grad_norm": 0.26261219810372477, "learning_rate": 1.862075176327482e-05, "loss": 0.8071, "step": 1914 }, { "epoch": 0.527076309089658, "grad_norm": 0.25963322205954664, "learning_rate": 1.8603321183192118e-05, "loss": 0.773, "step": 1915 }, { "epoch": 0.527351544760201, "grad_norm": 0.279823419586967, "learning_rate": 1.8585891669108662e-05, "loss": 0.8112, "step": 1916 }, { "epoch": 0.5276267804307438, "grad_norm": 0.2836224263795011, "learning_rate": 1.856846323432733e-05, "loss": 0.7739, "step": 1917 }, { "epoch": 0.5279020161012867, "grad_norm": 0.7075412114248868, "learning_rate": 1.8551035892150176e-05, "loss": 0.8135, "step": 1918 }, { "epoch": 0.5281772517718296, "grad_norm": 0.27593565525559094, "learning_rate": 1.853360965587842e-05, "loss": 0.7884, "step": 1919 }, { "epoch": 0.5284524874423725, "grad_norm": 0.2648960687542547, "learning_rate": 1.8516184538812454e-05, "loss": 0.7755, "step": 1920 }, { "epoch": 0.5287277231129154, "grad_norm": 0.27460823148077607, "learning_rate": 1.8498760554251788e-05, "loss": 0.7938, "step": 1921 }, { "epoch": 0.5290029587834584, "grad_norm": 0.25980882335955263, "learning_rate": 1.848133771549508e-05, "loss": 0.7612, "step": 1922 }, { "epoch": 0.5292781944540013, "grad_norm": 0.2771174473857577, "learning_rate": 1.8463916035840114e-05, "loss": 0.7937, "step": 1923 }, { "epoch": 0.5295534301245441, "grad_norm": 0.25927594122103753, "learning_rate": 1.844649552858379e-05, "loss": 0.8126, "step": 1924 }, { "epoch": 0.529828665795087, "grad_norm": 0.28591007027338844, "learning_rate": 1.8429076207022107e-05, "loss": 0.8046, "step": 1925 }, { "epoch": 0.5301039014656299, "grad_norm": 0.2837803520167671, "learning_rate": 1.841165808445018e-05, "loss": 0.8083, "step": 1926 }, { "epoch": 0.5303791371361728, "grad_norm": 0.28474195324148543, "learning_rate": 1.8394241174162184e-05, "loss": 0.7906, "step": 1927 }, { "epoch": 0.5306543728067158, "grad_norm": 0.28226403517159054, "learning_rate": 1.837682548945138e-05, "loss": 0.7982, "step": 1928 }, { "epoch": 0.5309296084772587, "grad_norm": 0.2950887256233825, "learning_rate": 1.8359411043610083e-05, "loss": 0.8103, "step": 1929 }, { "epoch": 0.5312048441478016, "grad_norm": 0.3031647509151272, "learning_rate": 1.834199784992968e-05, "loss": 0.8108, "step": 1930 }, { "epoch": 0.5314800798183444, "grad_norm": 0.2709956390615864, "learning_rate": 1.8324585921700592e-05, "loss": 0.7783, "step": 1931 }, { "epoch": 0.5317553154888873, "grad_norm": 0.2902736025344367, "learning_rate": 1.8307175272212267e-05, "loss": 0.7876, "step": 1932 }, { "epoch": 0.5320305511594302, "grad_norm": 0.2774226425243251, "learning_rate": 1.82897659147532e-05, "loss": 0.7913, "step": 1933 }, { "epoch": 0.5323057868299732, "grad_norm": 0.3130545227057113, "learning_rate": 1.827235786261088e-05, "loss": 0.7881, "step": 1934 }, { "epoch": 0.5325810225005161, "grad_norm": 0.27805933752825074, "learning_rate": 1.8254951129071795e-05, "loss": 0.7695, "step": 1935 }, { "epoch": 0.532856258171059, "grad_norm": 0.2916677849268343, "learning_rate": 1.8237545727421455e-05, "loss": 0.8079, "step": 1936 }, { "epoch": 0.5331314938416019, "grad_norm": 0.2772032184415956, "learning_rate": 1.8220141670944322e-05, "loss": 0.8093, "step": 1937 }, { "epoch": 0.5334067295121447, "grad_norm": 0.31938560616910966, "learning_rate": 1.8202738972923848e-05, "loss": 0.7775, "step": 1938 }, { "epoch": 0.5336819651826876, "grad_norm": 0.28385693152230135, "learning_rate": 1.8185337646642436e-05, "loss": 0.7873, "step": 1939 }, { "epoch": 0.5339572008532306, "grad_norm": 0.2920670205085561, "learning_rate": 1.816793770538147e-05, "loss": 0.7941, "step": 1940 }, { "epoch": 0.5342324365237735, "grad_norm": 0.2523907852517901, "learning_rate": 1.8150539162421236e-05, "loss": 0.7784, "step": 1941 }, { "epoch": 0.5345076721943164, "grad_norm": 0.26872016920154207, "learning_rate": 1.8133142031040995e-05, "loss": 0.7688, "step": 1942 }, { "epoch": 0.5347829078648593, "grad_norm": 0.2630262231408708, "learning_rate": 1.81157463245189e-05, "loss": 0.782, "step": 1943 }, { "epoch": 0.5350581435354022, "grad_norm": 0.2463270254186401, "learning_rate": 1.809835205613202e-05, "loss": 0.7752, "step": 1944 }, { "epoch": 0.535333379205945, "grad_norm": 0.2550154470138386, "learning_rate": 1.808095923915634e-05, "loss": 0.8081, "step": 1945 }, { "epoch": 0.535608614876488, "grad_norm": 0.23806834115134007, "learning_rate": 1.8063567886866732e-05, "loss": 0.7873, "step": 1946 }, { "epoch": 0.5358838505470309, "grad_norm": 0.28055233795604595, "learning_rate": 1.804617801253694e-05, "loss": 0.7951, "step": 1947 }, { "epoch": 0.5361590862175738, "grad_norm": 0.25142578647695374, "learning_rate": 1.80287896294396e-05, "loss": 0.7438, "step": 1948 }, { "epoch": 0.5364343218881167, "grad_norm": 0.2848291687917642, "learning_rate": 1.8011402750846194e-05, "loss": 0.7922, "step": 1949 }, { "epoch": 0.5367095575586596, "grad_norm": 0.26649225347637134, "learning_rate": 1.7994017390027055e-05, "loss": 0.806, "step": 1950 }, { "epoch": 0.5369847932292025, "grad_norm": 0.25283388778282584, "learning_rate": 1.797663356025136e-05, "loss": 0.7918, "step": 1951 }, { "epoch": 0.5372600288997454, "grad_norm": 0.26392808637936516, "learning_rate": 1.795925127478713e-05, "loss": 0.8285, "step": 1952 }, { "epoch": 0.5375352645702883, "grad_norm": 0.24560111116558545, "learning_rate": 1.7941870546901178e-05, "loss": 0.7837, "step": 1953 }, { "epoch": 0.5378105002408312, "grad_norm": 0.28234253483537725, "learning_rate": 1.7924491389859172e-05, "loss": 0.7894, "step": 1954 }, { "epoch": 0.5380857359113741, "grad_norm": 0.2594093161334092, "learning_rate": 1.7907113816925546e-05, "loss": 0.8012, "step": 1955 }, { "epoch": 0.538360971581917, "grad_norm": 0.2779218796259984, "learning_rate": 1.788973784136353e-05, "loss": 0.7862, "step": 1956 }, { "epoch": 0.5386362072524599, "grad_norm": 0.2710859571554646, "learning_rate": 1.7872363476435142e-05, "loss": 0.7618, "step": 1957 }, { "epoch": 0.5389114429230029, "grad_norm": 0.2676404382532293, "learning_rate": 1.7854990735401174e-05, "loss": 0.8052, "step": 1958 }, { "epoch": 0.5391866785935457, "grad_norm": 0.2915039086597559, "learning_rate": 1.783761963152117e-05, "loss": 0.7833, "step": 1959 }, { "epoch": 0.5394619142640886, "grad_norm": 0.2501789605795621, "learning_rate": 1.782025017805342e-05, "loss": 0.7843, "step": 1960 }, { "epoch": 0.5397371499346315, "grad_norm": 0.26885451833283674, "learning_rate": 1.780288238825498e-05, "loss": 0.7741, "step": 1961 }, { "epoch": 0.5400123856051744, "grad_norm": 0.25660414107103297, "learning_rate": 1.77855162753816e-05, "loss": 0.7673, "step": 1962 }, { "epoch": 0.5402876212757173, "grad_norm": 0.2756794470378616, "learning_rate": 1.776815185268778e-05, "loss": 0.7916, "step": 1963 }, { "epoch": 0.5405628569462603, "grad_norm": 0.2648244605605683, "learning_rate": 1.7750789133426716e-05, "loss": 0.805, "step": 1964 }, { "epoch": 0.5408380926168032, "grad_norm": 0.27631326270263823, "learning_rate": 1.773342813085031e-05, "loss": 0.7911, "step": 1965 }, { "epoch": 0.541113328287346, "grad_norm": 0.25810654406129824, "learning_rate": 1.771606885820914e-05, "loss": 0.7807, "step": 1966 }, { "epoch": 0.5413885639578889, "grad_norm": 0.2852652284331418, "learning_rate": 1.7698711328752474e-05, "loss": 0.793, "step": 1967 }, { "epoch": 0.5416637996284318, "grad_norm": 0.25322695807372597, "learning_rate": 1.7681355555728257e-05, "loss": 0.7831, "step": 1968 }, { "epoch": 0.5419390352989747, "grad_norm": 0.27734794936103, "learning_rate": 1.766400155238309e-05, "loss": 0.786, "step": 1969 }, { "epoch": 0.5422142709695177, "grad_norm": 0.28437142254364345, "learning_rate": 1.7646649331962206e-05, "loss": 0.786, "step": 1970 }, { "epoch": 0.5424895066400606, "grad_norm": 0.2605076942673118, "learning_rate": 1.76292989077095e-05, "loss": 0.778, "step": 1971 }, { "epoch": 0.5427647423106035, "grad_norm": 0.2773342212685139, "learning_rate": 1.7611950292867476e-05, "loss": 0.77, "step": 1972 }, { "epoch": 0.5430399779811463, "grad_norm": 0.24994322864157123, "learning_rate": 1.759460350067728e-05, "loss": 0.7897, "step": 1973 }, { "epoch": 0.5433152136516892, "grad_norm": 0.2923516549091756, "learning_rate": 1.757725854437865e-05, "loss": 0.7555, "step": 1974 }, { "epoch": 0.5435904493222322, "grad_norm": 0.2530332743673048, "learning_rate": 1.7559915437209912e-05, "loss": 0.7776, "step": 1975 }, { "epoch": 0.5438656849927751, "grad_norm": 0.29298196056658304, "learning_rate": 1.7542574192408022e-05, "loss": 0.8423, "step": 1976 }, { "epoch": 0.544140920663318, "grad_norm": 0.2612608891964756, "learning_rate": 1.752523482320847e-05, "loss": 0.801, "step": 1977 }, { "epoch": 0.5444161563338609, "grad_norm": 0.26183185205119774, "learning_rate": 1.7507897342845338e-05, "loss": 0.7763, "step": 1978 }, { "epoch": 0.5446913920044038, "grad_norm": 0.28206152820613933, "learning_rate": 1.749056176455126e-05, "loss": 0.7919, "step": 1979 }, { "epoch": 0.5449666276749466, "grad_norm": 0.24624504473837036, "learning_rate": 1.747322810155742e-05, "loss": 0.7645, "step": 1980 }, { "epoch": 0.5452418633454896, "grad_norm": 0.2899629373082364, "learning_rate": 1.745589636709354e-05, "loss": 0.7709, "step": 1981 }, { "epoch": 0.5455170990160325, "grad_norm": 0.239110965729579, "learning_rate": 1.7438566574387864e-05, "loss": 0.7692, "step": 1982 }, { "epoch": 0.5457923346865754, "grad_norm": 0.25955568535138723, "learning_rate": 1.742123873666717e-05, "loss": 0.7918, "step": 1983 }, { "epoch": 0.5460675703571183, "grad_norm": 0.2563500845539961, "learning_rate": 1.740391286715672e-05, "loss": 0.7589, "step": 1984 }, { "epoch": 0.5463428060276612, "grad_norm": 0.27264598390987943, "learning_rate": 1.7386588979080303e-05, "loss": 0.8072, "step": 1985 }, { "epoch": 0.546618041698204, "grad_norm": 0.25736047422488356, "learning_rate": 1.7369267085660167e-05, "loss": 0.7853, "step": 1986 }, { "epoch": 0.546893277368747, "grad_norm": 0.25806570825572667, "learning_rate": 1.7351947200117057e-05, "loss": 0.7802, "step": 1987 }, { "epoch": 0.5471685130392899, "grad_norm": 0.2485352254529487, "learning_rate": 1.7334629335670176e-05, "loss": 0.7829, "step": 1988 }, { "epoch": 0.5474437487098328, "grad_norm": 0.26105039379455175, "learning_rate": 1.7317313505537184e-05, "loss": 0.7842, "step": 1989 }, { "epoch": 0.5477189843803757, "grad_norm": 0.23580315780505323, "learning_rate": 1.72999997229342e-05, "loss": 0.7857, "step": 1990 }, { "epoch": 0.5479942200509186, "grad_norm": 0.2597530681592618, "learning_rate": 1.7282688001075766e-05, "loss": 0.7875, "step": 1991 }, { "epoch": 0.5482694557214615, "grad_norm": 0.2618080097326894, "learning_rate": 1.7265378353174865e-05, "loss": 0.7899, "step": 1992 }, { "epoch": 0.5485446913920045, "grad_norm": 0.2344790619860551, "learning_rate": 1.724807079244288e-05, "loss": 0.7602, "step": 1993 }, { "epoch": 0.5488199270625473, "grad_norm": 0.25733670318435226, "learning_rate": 1.7230765332089613e-05, "loss": 0.7769, "step": 1994 }, { "epoch": 0.5490951627330902, "grad_norm": 0.24042937004196524, "learning_rate": 1.721346198532326e-05, "loss": 0.7698, "step": 1995 }, { "epoch": 0.5493703984036331, "grad_norm": 0.2421860357089892, "learning_rate": 1.71961607653504e-05, "loss": 0.7814, "step": 1996 }, { "epoch": 0.549645634074176, "grad_norm": 0.24364077587803198, "learning_rate": 1.7178861685376004e-05, "loss": 0.7571, "step": 1997 }, { "epoch": 0.5499208697447189, "grad_norm": 0.2447840232382594, "learning_rate": 1.7161564758603392e-05, "loss": 0.7752, "step": 1998 }, { "epoch": 0.5501961054152619, "grad_norm": 0.628668640344974, "learning_rate": 1.7144269998234244e-05, "loss": 0.7966, "step": 1999 }, { "epoch": 0.5504713410858048, "grad_norm": 0.2571287619838796, "learning_rate": 1.712697741746859e-05, "loss": 0.8053, "step": 2000 }, { "epoch": 0.5507465767563476, "grad_norm": 0.26695455010383273, "learning_rate": 1.7109687029504805e-05, "loss": 0.7676, "step": 2001 }, { "epoch": 0.5510218124268905, "grad_norm": 0.2573235757930642, "learning_rate": 1.709239884753957e-05, "loss": 0.814, "step": 2002 }, { "epoch": 0.5512970480974334, "grad_norm": 0.2751273845472534, "learning_rate": 1.707511288476789e-05, "loss": 0.805, "step": 2003 }, { "epoch": 0.5515722837679763, "grad_norm": 0.272689116456555, "learning_rate": 1.7057829154383095e-05, "loss": 0.7824, "step": 2004 }, { "epoch": 0.5518475194385193, "grad_norm": 0.2740587491604914, "learning_rate": 1.704054766957679e-05, "loss": 0.7973, "step": 2005 }, { "epoch": 0.5521227551090622, "grad_norm": 0.25333928990399196, "learning_rate": 1.7023268443538868e-05, "loss": 0.8045, "step": 2006 }, { "epoch": 0.552397990779605, "grad_norm": 0.26336090249565214, "learning_rate": 1.700599148945751e-05, "loss": 0.7995, "step": 2007 }, { "epoch": 0.5526732264501479, "grad_norm": 0.2621200552071566, "learning_rate": 1.6988716820519145e-05, "loss": 0.766, "step": 2008 }, { "epoch": 0.5529484621206908, "grad_norm": 0.25582732912748624, "learning_rate": 1.6971444449908474e-05, "loss": 0.7864, "step": 2009 }, { "epoch": 0.5532236977912337, "grad_norm": 0.2604799412347714, "learning_rate": 1.695417439080843e-05, "loss": 0.7877, "step": 2010 }, { "epoch": 0.5534989334617767, "grad_norm": 0.25328887720164894, "learning_rate": 1.6936906656400197e-05, "loss": 0.7656, "step": 2011 }, { "epoch": 0.5537741691323196, "grad_norm": 0.2534440213109559, "learning_rate": 1.691964125986318e-05, "loss": 0.7907, "step": 2012 }, { "epoch": 0.5540494048028625, "grad_norm": 0.2476470570149038, "learning_rate": 1.6902378214374995e-05, "loss": 0.7697, "step": 2013 }, { "epoch": 0.5543246404734054, "grad_norm": 0.2694213175230382, "learning_rate": 1.6885117533111463e-05, "loss": 0.7988, "step": 2014 }, { "epoch": 0.5545998761439482, "grad_norm": 0.30841770855502404, "learning_rate": 1.68678592292466e-05, "loss": 0.7796, "step": 2015 }, { "epoch": 0.5548751118144911, "grad_norm": 0.25753374373992016, "learning_rate": 1.6850603315952613e-05, "loss": 0.776, "step": 2016 }, { "epoch": 0.5551503474850341, "grad_norm": 0.2437887936932628, "learning_rate": 1.683334980639988e-05, "loss": 0.7712, "step": 2017 }, { "epoch": 0.555425583155577, "grad_norm": 0.26321086219079554, "learning_rate": 1.6816098713756956e-05, "loss": 0.7709, "step": 2018 }, { "epoch": 0.5557008188261199, "grad_norm": 0.24695977090983962, "learning_rate": 1.679885005119053e-05, "loss": 0.7985, "step": 2019 }, { "epoch": 0.5559760544966628, "grad_norm": 0.2764422194698112, "learning_rate": 1.6781603831865457e-05, "loss": 0.7687, "step": 2020 }, { "epoch": 0.5562512901672056, "grad_norm": 0.26101368078997494, "learning_rate": 1.6764360068944706e-05, "loss": 0.7706, "step": 2021 }, { "epoch": 0.5565265258377485, "grad_norm": 0.2755045165307206, "learning_rate": 1.6747118775589398e-05, "loss": 0.769, "step": 2022 }, { "epoch": 0.5568017615082915, "grad_norm": 0.26081237103206856, "learning_rate": 1.6729879964958744e-05, "loss": 0.7376, "step": 2023 }, { "epoch": 0.5570769971788344, "grad_norm": 0.27554314183027323, "learning_rate": 1.6712643650210074e-05, "loss": 0.7848, "step": 2024 }, { "epoch": 0.5573522328493773, "grad_norm": 0.28565893837510764, "learning_rate": 1.66954098444988e-05, "loss": 0.7632, "step": 2025 }, { "epoch": 0.5576274685199202, "grad_norm": 0.2737243329516259, "learning_rate": 1.6678178560978448e-05, "loss": 0.8029, "step": 2026 }, { "epoch": 0.5579027041904631, "grad_norm": 0.299381249200942, "learning_rate": 1.6660949812800584e-05, "loss": 0.7776, "step": 2027 }, { "epoch": 0.558177939861006, "grad_norm": 0.2638957717972394, "learning_rate": 1.6643723613114862e-05, "loss": 0.7969, "step": 2028 }, { "epoch": 0.5584531755315489, "grad_norm": 0.30472460658726175, "learning_rate": 1.6626499975068982e-05, "loss": 0.7797, "step": 2029 }, { "epoch": 0.5587284112020918, "grad_norm": 0.2590340483031841, "learning_rate": 1.6609278911808688e-05, "loss": 0.7547, "step": 2030 }, { "epoch": 0.5590036468726347, "grad_norm": 0.31529266976023407, "learning_rate": 1.659206043647776e-05, "loss": 0.7578, "step": 2031 }, { "epoch": 0.5592788825431776, "grad_norm": 0.25403425411898994, "learning_rate": 1.6574844562218e-05, "loss": 0.7751, "step": 2032 }, { "epoch": 0.5595541182137205, "grad_norm": 0.29800038972180426, "learning_rate": 1.6557631302169236e-05, "loss": 0.7718, "step": 2033 }, { "epoch": 0.5598293538842634, "grad_norm": 0.2741538551149542, "learning_rate": 1.6540420669469298e-05, "loss": 0.7611, "step": 2034 }, { "epoch": 0.5601045895548064, "grad_norm": 0.32261246465357896, "learning_rate": 1.6523212677253996e-05, "loss": 0.7896, "step": 2035 }, { "epoch": 0.5603798252253492, "grad_norm": 0.284204794938927, "learning_rate": 1.650600733865714e-05, "loss": 0.7836, "step": 2036 }, { "epoch": 0.5606550608958921, "grad_norm": 0.26216419660183365, "learning_rate": 1.6488804666810504e-05, "loss": 0.7828, "step": 2037 }, { "epoch": 0.560930296566435, "grad_norm": 0.2957938006575376, "learning_rate": 1.647160467484384e-05, "loss": 0.7812, "step": 2038 }, { "epoch": 0.5612055322369779, "grad_norm": 0.24078415224646846, "learning_rate": 1.6454407375884828e-05, "loss": 0.759, "step": 2039 }, { "epoch": 0.5614807679075208, "grad_norm": 0.28878220901442014, "learning_rate": 1.6437212783059136e-05, "loss": 0.7706, "step": 2040 }, { "epoch": 0.5617560035780638, "grad_norm": 0.24912996279183475, "learning_rate": 1.642002090949033e-05, "loss": 0.7904, "step": 2041 }, { "epoch": 0.5620312392486067, "grad_norm": 0.2907681941664777, "learning_rate": 1.6402831768299913e-05, "loss": 0.7843, "step": 2042 }, { "epoch": 0.5623064749191495, "grad_norm": 0.23475718522735167, "learning_rate": 1.63856453726073e-05, "loss": 0.7858, "step": 2043 }, { "epoch": 0.5625817105896924, "grad_norm": 0.25802734415634354, "learning_rate": 1.6368461735529816e-05, "loss": 0.8037, "step": 2044 }, { "epoch": 0.5628569462602353, "grad_norm": 0.22740669438433816, "learning_rate": 1.635128087018268e-05, "loss": 0.7536, "step": 2045 }, { "epoch": 0.5631321819307782, "grad_norm": 0.2532030148949126, "learning_rate": 1.6334102789678973e-05, "loss": 0.7958, "step": 2046 }, { "epoch": 0.5634074176013212, "grad_norm": 0.24557486295621084, "learning_rate": 1.631692750712969e-05, "loss": 0.7848, "step": 2047 }, { "epoch": 0.5636826532718641, "grad_norm": 0.252681918118479, "learning_rate": 1.6299755035643668e-05, "loss": 0.7726, "step": 2048 }, { "epoch": 0.563957888942407, "grad_norm": 0.2539413400854405, "learning_rate": 1.6282585388327596e-05, "loss": 0.7772, "step": 2049 }, { "epoch": 0.5642331246129498, "grad_norm": 0.25698067561539034, "learning_rate": 1.6265418578286016e-05, "loss": 0.7544, "step": 2050 }, { "epoch": 0.5645083602834927, "grad_norm": 0.25552410852604446, "learning_rate": 1.62482546186213e-05, "loss": 0.7657, "step": 2051 }, { "epoch": 0.5647835959540356, "grad_norm": 0.25547516308813145, "learning_rate": 1.6231093522433644e-05, "loss": 0.7841, "step": 2052 }, { "epoch": 0.5650588316245786, "grad_norm": 0.23919288113864054, "learning_rate": 1.6213935302821048e-05, "loss": 0.7812, "step": 2053 }, { "epoch": 0.5653340672951215, "grad_norm": 0.24517970086684646, "learning_rate": 1.6196779972879342e-05, "loss": 0.7708, "step": 2054 }, { "epoch": 0.5656093029656644, "grad_norm": 0.24938526180701784, "learning_rate": 1.6179627545702146e-05, "loss": 0.759, "step": 2055 }, { "epoch": 0.5658845386362072, "grad_norm": 0.24762322015857288, "learning_rate": 1.6162478034380843e-05, "loss": 0.7662, "step": 2056 }, { "epoch": 0.5661597743067501, "grad_norm": 0.24722313649073263, "learning_rate": 1.61453314520046e-05, "loss": 0.7777, "step": 2057 }, { "epoch": 0.566435009977293, "grad_norm": 0.25320830188852356, "learning_rate": 1.612818781166035e-05, "loss": 0.7807, "step": 2058 }, { "epoch": 0.566710245647836, "grad_norm": 0.36654122738915146, "learning_rate": 1.6111047126432794e-05, "loss": 0.7838, "step": 2059 }, { "epoch": 0.5669854813183789, "grad_norm": 0.26133322944692217, "learning_rate": 1.6093909409404352e-05, "loss": 0.7798, "step": 2060 }, { "epoch": 0.5672607169889218, "grad_norm": 0.2881526767960742, "learning_rate": 1.6076774673655204e-05, "loss": 0.8043, "step": 2061 }, { "epoch": 0.5675359526594647, "grad_norm": 0.2525406598946525, "learning_rate": 1.6059642932263235e-05, "loss": 0.8085, "step": 2062 }, { "epoch": 0.5678111883300075, "grad_norm": 0.2498535732259371, "learning_rate": 1.6042514198304056e-05, "loss": 0.783, "step": 2063 }, { "epoch": 0.5680864240005504, "grad_norm": 0.2466455408249282, "learning_rate": 1.602538848485097e-05, "loss": 0.7676, "step": 2064 }, { "epoch": 0.5683616596710934, "grad_norm": 0.24582045772189817, "learning_rate": 1.6008265804974998e-05, "loss": 0.7559, "step": 2065 }, { "epoch": 0.5686368953416363, "grad_norm": 0.25116044544645955, "learning_rate": 1.599114617174482e-05, "loss": 0.786, "step": 2066 }, { "epoch": 0.5689121310121792, "grad_norm": 0.2506465479046168, "learning_rate": 1.5974029598226796e-05, "loss": 0.7845, "step": 2067 }, { "epoch": 0.5691873666827221, "grad_norm": 0.24848271715394182, "learning_rate": 1.5956916097484975e-05, "loss": 0.7795, "step": 2068 }, { "epoch": 0.569462602353265, "grad_norm": 0.2402544233325968, "learning_rate": 1.593980568258103e-05, "loss": 0.7936, "step": 2069 }, { "epoch": 0.5697378380238078, "grad_norm": 0.2549298842223705, "learning_rate": 1.592269836657429e-05, "loss": 0.752, "step": 2070 }, { "epoch": 0.5700130736943508, "grad_norm": 0.23611264837052431, "learning_rate": 1.5905594162521725e-05, "loss": 0.7971, "step": 2071 }, { "epoch": 0.5702883093648937, "grad_norm": 0.24434415474606616, "learning_rate": 1.5888493083477926e-05, "loss": 0.7524, "step": 2072 }, { "epoch": 0.5705635450354366, "grad_norm": 0.2503065691490492, "learning_rate": 1.587139514249509e-05, "loss": 0.8098, "step": 2073 }, { "epoch": 0.5708387807059795, "grad_norm": 0.24329055635485347, "learning_rate": 1.5854300352623023e-05, "loss": 0.7398, "step": 2074 }, { "epoch": 0.5711140163765224, "grad_norm": 0.2572120138053558, "learning_rate": 1.583720872690914e-05, "loss": 0.761, "step": 2075 }, { "epoch": 0.5713892520470653, "grad_norm": 0.23979776297460612, "learning_rate": 1.5820120278398424e-05, "loss": 0.8041, "step": 2076 }, { "epoch": 0.5716644877176082, "grad_norm": 0.2624330874838944, "learning_rate": 1.5803035020133448e-05, "loss": 0.7963, "step": 2077 }, { "epoch": 0.5719397233881511, "grad_norm": 0.23889894527585132, "learning_rate": 1.578595296515433e-05, "loss": 0.7865, "step": 2078 }, { "epoch": 0.572214959058694, "grad_norm": 0.25073987996449615, "learning_rate": 1.5768874126498766e-05, "loss": 0.7892, "step": 2079 }, { "epoch": 0.5724901947292369, "grad_norm": 0.23814982283441102, "learning_rate": 1.5751798517201972e-05, "loss": 0.8236, "step": 2080 }, { "epoch": 0.5727654303997798, "grad_norm": 0.2539968039401214, "learning_rate": 1.5734726150296725e-05, "loss": 0.7881, "step": 2081 }, { "epoch": 0.5730406660703227, "grad_norm": 0.25969761411521936, "learning_rate": 1.57176570388133e-05, "loss": 0.8042, "step": 2082 }, { "epoch": 0.5733159017408657, "grad_norm": 0.2375301691335384, "learning_rate": 1.570059119577952e-05, "loss": 0.7835, "step": 2083 }, { "epoch": 0.5735911374114085, "grad_norm": 0.2626076389318922, "learning_rate": 1.568352863422069e-05, "loss": 0.7935, "step": 2084 }, { "epoch": 0.5738663730819514, "grad_norm": 0.23009338452442388, "learning_rate": 1.5666469367159613e-05, "loss": 0.7742, "step": 2085 }, { "epoch": 0.5741416087524943, "grad_norm": 0.2619299816428588, "learning_rate": 1.564941340761658e-05, "loss": 0.7642, "step": 2086 }, { "epoch": 0.5744168444230372, "grad_norm": 0.25405711124962455, "learning_rate": 1.563236076860937e-05, "loss": 0.765, "step": 2087 }, { "epoch": 0.5746920800935801, "grad_norm": 0.2645673429868364, "learning_rate": 1.56153114631532e-05, "loss": 0.7861, "step": 2088 }, { "epoch": 0.5749673157641231, "grad_norm": 0.2525486695505516, "learning_rate": 1.559826550426076e-05, "loss": 0.7944, "step": 2089 }, { "epoch": 0.575242551434666, "grad_norm": 0.23315936847014512, "learning_rate": 1.55812229049422e-05, "loss": 0.7585, "step": 2090 }, { "epoch": 0.5755177871052088, "grad_norm": 0.2551352924648423, "learning_rate": 1.5564183678205074e-05, "loss": 0.7463, "step": 2091 }, { "epoch": 0.5757930227757517, "grad_norm": 0.22795328646090968, "learning_rate": 1.5547147837054392e-05, "loss": 0.7966, "step": 2092 }, { "epoch": 0.5760682584462946, "grad_norm": 0.24822441078388702, "learning_rate": 1.553011539449256e-05, "loss": 0.7869, "step": 2093 }, { "epoch": 0.5763434941168375, "grad_norm": 0.24696033515063354, "learning_rate": 1.5513086363519392e-05, "loss": 0.7625, "step": 2094 }, { "epoch": 0.5766187297873805, "grad_norm": 0.23207065303808233, "learning_rate": 1.5496060757132112e-05, "loss": 0.7887, "step": 2095 }, { "epoch": 0.5768939654579234, "grad_norm": 0.24032261378587064, "learning_rate": 1.5479038588325303e-05, "loss": 0.7783, "step": 2096 }, { "epoch": 0.5771692011284663, "grad_norm": 0.2616143376584418, "learning_rate": 1.546201987009096e-05, "loss": 0.7939, "step": 2097 }, { "epoch": 0.5774444367990091, "grad_norm": 0.25238477358610734, "learning_rate": 1.5445004615418425e-05, "loss": 0.7854, "step": 2098 }, { "epoch": 0.577719672469552, "grad_norm": 0.27345842456168395, "learning_rate": 1.5427992837294393e-05, "loss": 0.7705, "step": 2099 }, { "epoch": 0.5779949081400949, "grad_norm": 0.24797525297088446, "learning_rate": 1.5410984548702913e-05, "loss": 0.7754, "step": 2100 }, { "epoch": 0.5782701438106379, "grad_norm": 0.2547197947186458, "learning_rate": 1.5393979762625363e-05, "loss": 0.8208, "step": 2101 }, { "epoch": 0.5785453794811808, "grad_norm": 0.24702558543063802, "learning_rate": 1.5376978492040455e-05, "loss": 0.77, "step": 2102 }, { "epoch": 0.5788206151517237, "grad_norm": 0.25100614022554396, "learning_rate": 1.5359980749924212e-05, "loss": 0.7638, "step": 2103 }, { "epoch": 0.5790958508222666, "grad_norm": 0.2460103820922042, "learning_rate": 1.534298654924998e-05, "loss": 0.7929, "step": 2104 }, { "epoch": 0.5793710864928094, "grad_norm": 0.24290709268941307, "learning_rate": 1.5325995902988386e-05, "loss": 0.7885, "step": 2105 }, { "epoch": 0.5796463221633523, "grad_norm": 0.23892080646614608, "learning_rate": 1.530900882410734e-05, "loss": 0.8172, "step": 2106 }, { "epoch": 0.5799215578338953, "grad_norm": 0.24946689388002227, "learning_rate": 1.5292025325572035e-05, "loss": 0.7684, "step": 2107 }, { "epoch": 0.5801967935044382, "grad_norm": 0.23196145630886625, "learning_rate": 1.5275045420344947e-05, "loss": 0.7778, "step": 2108 }, { "epoch": 0.5804720291749811, "grad_norm": 0.26101729842998894, "learning_rate": 1.5258069121385789e-05, "loss": 0.8088, "step": 2109 }, { "epoch": 0.580747264845524, "grad_norm": 0.24520871895155857, "learning_rate": 1.5241096441651518e-05, "loss": 0.7919, "step": 2110 }, { "epoch": 0.5810225005160669, "grad_norm": 0.25198000687125316, "learning_rate": 1.5224127394096357e-05, "loss": 0.7777, "step": 2111 }, { "epoch": 0.5812977361866097, "grad_norm": 0.23740133013705317, "learning_rate": 1.520716199167173e-05, "loss": 0.7272, "step": 2112 }, { "epoch": 0.5815729718571527, "grad_norm": 0.2415496612068821, "learning_rate": 1.5190200247326286e-05, "loss": 0.7951, "step": 2113 }, { "epoch": 0.5818482075276956, "grad_norm": 0.2548145164442953, "learning_rate": 1.517324217400589e-05, "loss": 0.7824, "step": 2114 }, { "epoch": 0.5821234431982385, "grad_norm": 0.23201023898433434, "learning_rate": 1.5156287784653594e-05, "loss": 0.8018, "step": 2115 }, { "epoch": 0.5823986788687814, "grad_norm": 0.29720187732594855, "learning_rate": 1.5139337092209645e-05, "loss": 0.7733, "step": 2116 }, { "epoch": 0.5826739145393243, "grad_norm": 0.23146801505485573, "learning_rate": 1.5122390109611458e-05, "loss": 0.8012, "step": 2117 }, { "epoch": 0.5829491502098672, "grad_norm": 0.2482537042383629, "learning_rate": 1.510544684979364e-05, "loss": 0.7852, "step": 2118 }, { "epoch": 0.5832243858804101, "grad_norm": 0.24202146603866012, "learning_rate": 1.5088507325687931e-05, "loss": 0.7807, "step": 2119 }, { "epoch": 0.583499621550953, "grad_norm": 1.2876778355705045, "learning_rate": 1.5071571550223238e-05, "loss": 0.7896, "step": 2120 }, { "epoch": 0.5837748572214959, "grad_norm": 0.2506389255098671, "learning_rate": 1.5054639536325595e-05, "loss": 0.791, "step": 2121 }, { "epoch": 0.5840500928920388, "grad_norm": 0.2594330635551027, "learning_rate": 1.5037711296918169e-05, "loss": 0.7851, "step": 2122 }, { "epoch": 0.5843253285625817, "grad_norm": 0.257735357416782, "learning_rate": 1.5020786844921245e-05, "loss": 0.7968, "step": 2123 }, { "epoch": 0.5846005642331246, "grad_norm": 0.23623757668631695, "learning_rate": 1.500386619325222e-05, "loss": 0.7427, "step": 2124 }, { "epoch": 0.5848757999036676, "grad_norm": 0.27831653869214235, "learning_rate": 1.498694935482559e-05, "loss": 0.8033, "step": 2125 }, { "epoch": 0.5851510355742104, "grad_norm": 0.25548238246861205, "learning_rate": 1.497003634255294e-05, "loss": 0.7979, "step": 2126 }, { "epoch": 0.5854262712447533, "grad_norm": 0.2593386786294403, "learning_rate": 1.495312716934294e-05, "loss": 0.7706, "step": 2127 }, { "epoch": 0.5857015069152962, "grad_norm": 0.24010246593866066, "learning_rate": 1.4936221848101315e-05, "loss": 0.7941, "step": 2128 }, { "epoch": 0.5859767425858391, "grad_norm": 0.2701389034455053, "learning_rate": 1.4919320391730862e-05, "loss": 0.7741, "step": 2129 }, { "epoch": 0.586251978256382, "grad_norm": 0.24203094574073675, "learning_rate": 1.4902422813131433e-05, "loss": 0.7661, "step": 2130 }, { "epoch": 0.586527213926925, "grad_norm": 0.2671619521416786, "learning_rate": 1.4885529125199902e-05, "loss": 0.7701, "step": 2131 }, { "epoch": 0.5868024495974679, "grad_norm": 0.252204521046989, "learning_rate": 1.4868639340830185e-05, "loss": 0.7724, "step": 2132 }, { "epoch": 0.5870776852680107, "grad_norm": 0.2726669507042234, "learning_rate": 1.4851753472913228e-05, "loss": 0.7959, "step": 2133 }, { "epoch": 0.5873529209385536, "grad_norm": 0.24867246127598813, "learning_rate": 1.4834871534336972e-05, "loss": 0.8058, "step": 2134 }, { "epoch": 0.5876281566090965, "grad_norm": 0.26141394866877476, "learning_rate": 1.4817993537986368e-05, "loss": 0.768, "step": 2135 }, { "epoch": 0.5879033922796394, "grad_norm": 0.2587387847788051, "learning_rate": 1.4801119496743353e-05, "loss": 0.7864, "step": 2136 }, { "epoch": 0.5881786279501824, "grad_norm": 0.2511675708237603, "learning_rate": 1.4784249423486845e-05, "loss": 0.7793, "step": 2137 }, { "epoch": 0.5884538636207253, "grad_norm": 0.25023817845508245, "learning_rate": 1.4767383331092737e-05, "loss": 0.7679, "step": 2138 }, { "epoch": 0.5887290992912682, "grad_norm": 0.26930821744504835, "learning_rate": 1.4750521232433879e-05, "loss": 0.7976, "step": 2139 }, { "epoch": 0.589004334961811, "grad_norm": 0.23473374959504417, "learning_rate": 1.4733663140380081e-05, "loss": 0.7897, "step": 2140 }, { "epoch": 0.5892795706323539, "grad_norm": 0.2706088837656889, "learning_rate": 1.4716809067798097e-05, "loss": 0.7771, "step": 2141 }, { "epoch": 0.5895548063028968, "grad_norm": 0.22751531787456653, "learning_rate": 1.4699959027551598e-05, "loss": 0.7703, "step": 2142 }, { "epoch": 0.5898300419734398, "grad_norm": 0.2821473648131303, "learning_rate": 1.4683113032501188e-05, "loss": 0.7862, "step": 2143 }, { "epoch": 0.5901052776439827, "grad_norm": 0.2381236393678348, "learning_rate": 1.4666271095504377e-05, "loss": 0.7868, "step": 2144 }, { "epoch": 0.5903805133145256, "grad_norm": 0.35446051068971185, "learning_rate": 1.4649433229415588e-05, "loss": 0.7926, "step": 2145 }, { "epoch": 0.5906557489850685, "grad_norm": 0.23549622075698776, "learning_rate": 1.4632599447086123e-05, "loss": 0.793, "step": 2146 }, { "epoch": 0.5909309846556113, "grad_norm": 0.320356279305611, "learning_rate": 1.461576976136419e-05, "loss": 0.7905, "step": 2147 }, { "epoch": 0.5912062203261542, "grad_norm": 0.22676829503990834, "learning_rate": 1.4598944185094843e-05, "loss": 0.7581, "step": 2148 }, { "epoch": 0.5914814559966972, "grad_norm": 0.2652106589087, "learning_rate": 1.4582122731120018e-05, "loss": 0.778, "step": 2149 }, { "epoch": 0.5917566916672401, "grad_norm": 0.23972531109798575, "learning_rate": 1.4565305412278492e-05, "loss": 0.7959, "step": 2150 }, { "epoch": 0.592031927337783, "grad_norm": 0.27323471299885144, "learning_rate": 1.4548492241405902e-05, "loss": 0.7419, "step": 2151 }, { "epoch": 0.5923071630083259, "grad_norm": 0.24884190065273046, "learning_rate": 1.4531683231334705e-05, "loss": 0.789, "step": 2152 }, { "epoch": 0.5925823986788687, "grad_norm": 0.2749619768226162, "learning_rate": 1.4514878394894179e-05, "loss": 0.7795, "step": 2153 }, { "epoch": 0.5928576343494116, "grad_norm": 0.24176446091523318, "learning_rate": 1.449807774491044e-05, "loss": 0.776, "step": 2154 }, { "epoch": 0.5931328700199546, "grad_norm": 0.26236281769206066, "learning_rate": 1.4481281294206384e-05, "loss": 0.7911, "step": 2155 }, { "epoch": 0.5934081056904975, "grad_norm": 0.24418485563127318, "learning_rate": 1.4464489055601711e-05, "loss": 0.7624, "step": 2156 }, { "epoch": 0.5936833413610404, "grad_norm": 0.24652423998659218, "learning_rate": 1.4447701041912913e-05, "loss": 0.7798, "step": 2157 }, { "epoch": 0.5939585770315833, "grad_norm": 0.25788494067196344, "learning_rate": 1.4430917265953249e-05, "loss": 0.7896, "step": 2158 }, { "epoch": 0.5942338127021262, "grad_norm": 0.24422235437206616, "learning_rate": 1.441413774053274e-05, "loss": 0.7814, "step": 2159 }, { "epoch": 0.594509048372669, "grad_norm": 0.2698403209678665, "learning_rate": 1.4397362478458161e-05, "loss": 0.7979, "step": 2160 }, { "epoch": 0.594784284043212, "grad_norm": 0.4504993518957787, "learning_rate": 1.438059149253306e-05, "loss": 0.8036, "step": 2161 }, { "epoch": 0.5950595197137549, "grad_norm": 0.24344810912433718, "learning_rate": 1.4363824795557688e-05, "loss": 0.8054, "step": 2162 }, { "epoch": 0.5953347553842978, "grad_norm": 0.2469985797108369, "learning_rate": 1.4347062400329046e-05, "loss": 0.7752, "step": 2163 }, { "epoch": 0.5956099910548407, "grad_norm": 0.2490002922314942, "learning_rate": 1.4330304319640834e-05, "loss": 0.7929, "step": 2164 }, { "epoch": 0.5958852267253836, "grad_norm": 0.2429649803979391, "learning_rate": 1.4313550566283466e-05, "loss": 0.7888, "step": 2165 }, { "epoch": 0.5961604623959265, "grad_norm": 0.24481603235890745, "learning_rate": 1.4296801153044055e-05, "loss": 0.7885, "step": 2166 }, { "epoch": 0.5964356980664695, "grad_norm": 0.3080950706307856, "learning_rate": 1.4280056092706405e-05, "loss": 0.7915, "step": 2167 }, { "epoch": 0.5967109337370123, "grad_norm": 0.30616858358392884, "learning_rate": 1.4263315398050986e-05, "loss": 0.7635, "step": 2168 }, { "epoch": 0.5969861694075552, "grad_norm": 0.24086458449552933, "learning_rate": 1.4246579081854953e-05, "loss": 0.7856, "step": 2169 }, { "epoch": 0.5972614050780981, "grad_norm": 0.26227700423509437, "learning_rate": 1.4229847156892102e-05, "loss": 0.7935, "step": 2170 }, { "epoch": 0.597536640748641, "grad_norm": 0.22949999843540742, "learning_rate": 1.4213119635932889e-05, "loss": 0.8084, "step": 2171 }, { "epoch": 0.5978118764191839, "grad_norm": 0.3656471073242874, "learning_rate": 1.4196396531744397e-05, "loss": 0.743, "step": 2172 }, { "epoch": 0.5980871120897269, "grad_norm": 0.22416655439542127, "learning_rate": 1.4179677857090353e-05, "loss": 0.7608, "step": 2173 }, { "epoch": 0.5983623477602698, "grad_norm": 0.231795374036869, "learning_rate": 1.4162963624731083e-05, "loss": 0.7713, "step": 2174 }, { "epoch": 0.5986375834308126, "grad_norm": 0.23566291636624995, "learning_rate": 1.4146253847423555e-05, "loss": 0.7864, "step": 2175 }, { "epoch": 0.5989128191013555, "grad_norm": 0.2517273479510146, "learning_rate": 1.4129548537921308e-05, "loss": 0.7865, "step": 2176 }, { "epoch": 0.5991880547718984, "grad_norm": 0.22410139984949565, "learning_rate": 1.4112847708974471e-05, "loss": 0.7909, "step": 2177 }, { "epoch": 0.5994632904424413, "grad_norm": 0.26533693472104647, "learning_rate": 1.4096151373329777e-05, "loss": 0.7648, "step": 2178 }, { "epoch": 0.5997385261129843, "grad_norm": 0.23050780141550972, "learning_rate": 1.4079459543730504e-05, "loss": 0.779, "step": 2179 }, { "epoch": 0.6000137617835272, "grad_norm": 0.265280586075992, "learning_rate": 1.4062772232916507e-05, "loss": 0.7648, "step": 2180 }, { "epoch": 0.60028899745407, "grad_norm": 0.23622287788664692, "learning_rate": 1.4046089453624181e-05, "loss": 0.7902, "step": 2181 }, { "epoch": 0.6005642331246129, "grad_norm": 0.23610116149772342, "learning_rate": 1.4029411218586464e-05, "loss": 0.7497, "step": 2182 }, { "epoch": 0.6008394687951558, "grad_norm": 0.2234032434678829, "learning_rate": 1.4012737540532842e-05, "loss": 0.7719, "step": 2183 }, { "epoch": 0.6011147044656987, "grad_norm": 0.2504256253889632, "learning_rate": 1.3996068432189305e-05, "loss": 0.7751, "step": 2184 }, { "epoch": 0.6013899401362417, "grad_norm": 0.23610034084470008, "learning_rate": 1.3979403906278362e-05, "loss": 0.7867, "step": 2185 }, { "epoch": 0.6016651758067846, "grad_norm": 0.26373980797829544, "learning_rate": 1.3962743975519021e-05, "loss": 0.7916, "step": 2186 }, { "epoch": 0.6019404114773275, "grad_norm": 0.2335537381776213, "learning_rate": 1.3946088652626784e-05, "loss": 0.8085, "step": 2187 }, { "epoch": 0.6022156471478703, "grad_norm": 0.2576714206143028, "learning_rate": 1.392943795031364e-05, "loss": 0.7874, "step": 2188 }, { "epoch": 0.6024908828184132, "grad_norm": 0.2316709720621963, "learning_rate": 1.391279188128804e-05, "loss": 0.7803, "step": 2189 }, { "epoch": 0.6027661184889561, "grad_norm": 0.25280051602807724, "learning_rate": 1.389615045825492e-05, "loss": 0.7759, "step": 2190 }, { "epoch": 0.6030413541594991, "grad_norm": 0.2367383163771406, "learning_rate": 1.3879513693915654e-05, "loss": 0.7881, "step": 2191 }, { "epoch": 0.603316589830042, "grad_norm": 0.2602317282075869, "learning_rate": 1.386288160096806e-05, "loss": 0.7609, "step": 2192 }, { "epoch": 0.6035918255005849, "grad_norm": 0.23248407453862296, "learning_rate": 1.384625419210639e-05, "loss": 0.7829, "step": 2193 }, { "epoch": 0.6038670611711278, "grad_norm": 0.24736366078303895, "learning_rate": 1.3829631480021335e-05, "loss": 0.7729, "step": 2194 }, { "epoch": 0.6041422968416706, "grad_norm": 0.251112860818329, "learning_rate": 1.3813013477399989e-05, "loss": 0.7754, "step": 2195 }, { "epoch": 0.6044175325122135, "grad_norm": 0.2375044017911047, "learning_rate": 1.3796400196925837e-05, "loss": 0.7754, "step": 2196 }, { "epoch": 0.6046927681827565, "grad_norm": 0.236547571630386, "learning_rate": 1.3779791651278802e-05, "loss": 0.7735, "step": 2197 }, { "epoch": 0.6049680038532994, "grad_norm": 0.25860613071581184, "learning_rate": 1.3763187853135156e-05, "loss": 0.797, "step": 2198 }, { "epoch": 0.6052432395238423, "grad_norm": 0.2299632050146302, "learning_rate": 1.3746588815167555e-05, "loss": 0.7889, "step": 2199 }, { "epoch": 0.6055184751943852, "grad_norm": 0.2538085408409037, "learning_rate": 1.3729994550045036e-05, "loss": 0.7933, "step": 2200 }, { "epoch": 0.6057937108649281, "grad_norm": 0.23846684024218845, "learning_rate": 1.3713405070432977e-05, "loss": 0.8148, "step": 2201 }, { "epoch": 0.6060689465354709, "grad_norm": 0.24883675188960985, "learning_rate": 1.369682038899311e-05, "loss": 0.7836, "step": 2202 }, { "epoch": 0.6063441822060139, "grad_norm": 0.2201432094251142, "learning_rate": 1.3680240518383502e-05, "loss": 0.75, "step": 2203 }, { "epoch": 0.6066194178765568, "grad_norm": 0.2442907241332848, "learning_rate": 1.3663665471258563e-05, "loss": 0.7948, "step": 2204 }, { "epoch": 0.6068946535470997, "grad_norm": 0.23524850760591698, "learning_rate": 1.3647095260268994e-05, "loss": 0.7797, "step": 2205 }, { "epoch": 0.6071698892176426, "grad_norm": 0.24134584500947526, "learning_rate": 1.3630529898061834e-05, "loss": 0.7888, "step": 2206 }, { "epoch": 0.6074451248881855, "grad_norm": 0.24028508919377853, "learning_rate": 1.3613969397280405e-05, "loss": 0.7939, "step": 2207 }, { "epoch": 0.6077203605587284, "grad_norm": 0.23614908810125707, "learning_rate": 1.3597413770564316e-05, "loss": 0.7802, "step": 2208 }, { "epoch": 0.6079955962292714, "grad_norm": 0.24320103802629342, "learning_rate": 1.3580863030549457e-05, "loss": 0.7559, "step": 2209 }, { "epoch": 0.6082708318998142, "grad_norm": 0.22571292589650693, "learning_rate": 1.3564317189868e-05, "loss": 0.7911, "step": 2210 }, { "epoch": 0.6085460675703571, "grad_norm": 0.24051990040004587, "learning_rate": 1.3547776261148366e-05, "loss": 0.7728, "step": 2211 }, { "epoch": 0.6088213032409, "grad_norm": 0.2267786033004235, "learning_rate": 1.3531240257015239e-05, "loss": 0.7923, "step": 2212 }, { "epoch": 0.6090965389114429, "grad_norm": 0.2493283813453356, "learning_rate": 1.351470919008953e-05, "loss": 0.7787, "step": 2213 }, { "epoch": 0.6093717745819858, "grad_norm": 0.22358057500853282, "learning_rate": 1.3498183072988391e-05, "loss": 0.7814, "step": 2214 }, { "epoch": 0.6096470102525288, "grad_norm": 0.3440976131242661, "learning_rate": 1.3481661918325185e-05, "loss": 0.753, "step": 2215 }, { "epoch": 0.6099222459230716, "grad_norm": 0.22715054868175705, "learning_rate": 1.3465145738709506e-05, "loss": 0.7793, "step": 2216 }, { "epoch": 0.6101974815936145, "grad_norm": 0.2325658496299765, "learning_rate": 1.3448634546747128e-05, "loss": 0.7593, "step": 2217 }, { "epoch": 0.6104727172641574, "grad_norm": 0.2322885351765859, "learning_rate": 1.3432128355040048e-05, "loss": 0.7619, "step": 2218 }, { "epoch": 0.6107479529347003, "grad_norm": 0.2528217887923534, "learning_rate": 1.341562717618642e-05, "loss": 0.7987, "step": 2219 }, { "epoch": 0.6110231886052433, "grad_norm": 0.2392500109552147, "learning_rate": 1.3399131022780578e-05, "loss": 0.7536, "step": 2220 }, { "epoch": 0.6112984242757862, "grad_norm": 0.2307629409968488, "learning_rate": 1.3382639907413033e-05, "loss": 0.7731, "step": 2221 }, { "epoch": 0.6115736599463291, "grad_norm": 0.2273021766595541, "learning_rate": 1.3366153842670433e-05, "loss": 0.7942, "step": 2222 }, { "epoch": 0.611848895616872, "grad_norm": 0.30423518686942114, "learning_rate": 1.3349672841135586e-05, "loss": 0.8187, "step": 2223 }, { "epoch": 0.6121241312874148, "grad_norm": 0.23659855677025482, "learning_rate": 1.3333196915387414e-05, "loss": 0.7969, "step": 2224 }, { "epoch": 0.6123993669579577, "grad_norm": 0.25569499276193625, "learning_rate": 1.3316726078001003e-05, "loss": 0.8072, "step": 2225 }, { "epoch": 0.6126746026285007, "grad_norm": 0.23298867836581347, "learning_rate": 1.3300260341547519e-05, "loss": 0.793, "step": 2226 }, { "epoch": 0.6129498382990436, "grad_norm": 0.2511880191579635, "learning_rate": 1.3283799718594255e-05, "loss": 0.7997, "step": 2227 }, { "epoch": 0.6132250739695865, "grad_norm": 0.2840334137563068, "learning_rate": 1.326734422170459e-05, "loss": 0.7826, "step": 2228 }, { "epoch": 0.6135003096401294, "grad_norm": 0.23042560639985768, "learning_rate": 1.3250893863437996e-05, "loss": 0.7754, "step": 2229 }, { "epoch": 0.6137755453106722, "grad_norm": 0.26078713076993054, "learning_rate": 1.3234448656350018e-05, "loss": 0.781, "step": 2230 }, { "epoch": 0.6140507809812151, "grad_norm": 0.23830661626654637, "learning_rate": 1.3218008612992279e-05, "loss": 0.7803, "step": 2231 }, { "epoch": 0.6143260166517581, "grad_norm": 0.2426937094871854, "learning_rate": 1.3201573745912453e-05, "loss": 0.7478, "step": 2232 }, { "epoch": 0.614601252322301, "grad_norm": 0.24652065812213447, "learning_rate": 1.3185144067654272e-05, "loss": 0.7812, "step": 2233 }, { "epoch": 0.6148764879928439, "grad_norm": 0.2589077154275541, "learning_rate": 1.3168719590757495e-05, "loss": 0.7913, "step": 2234 }, { "epoch": 0.6151517236633868, "grad_norm": 0.24197152701505842, "learning_rate": 1.315230032775792e-05, "loss": 0.8002, "step": 2235 }, { "epoch": 0.6154269593339297, "grad_norm": 0.2590383635111659, "learning_rate": 1.3135886291187356e-05, "loss": 0.7614, "step": 2236 }, { "epoch": 0.6157021950044725, "grad_norm": 0.24330646988687127, "learning_rate": 1.311947749357364e-05, "loss": 0.7548, "step": 2237 }, { "epoch": 0.6159774306750155, "grad_norm": 0.2416648534181719, "learning_rate": 1.3103073947440596e-05, "loss": 0.7805, "step": 2238 }, { "epoch": 0.6162526663455584, "grad_norm": 0.2441149733997987, "learning_rate": 1.308667566530804e-05, "loss": 0.7625, "step": 2239 }, { "epoch": 0.6165279020161013, "grad_norm": 0.2575334262557978, "learning_rate": 1.3070282659691782e-05, "loss": 0.7389, "step": 2240 }, { "epoch": 0.6168031376866442, "grad_norm": 0.24980075078996547, "learning_rate": 1.3053894943103598e-05, "loss": 0.7855, "step": 2241 }, { "epoch": 0.6170783733571871, "grad_norm": 0.25545253681161445, "learning_rate": 1.3037512528051217e-05, "loss": 0.737, "step": 2242 }, { "epoch": 0.61735360902773, "grad_norm": 0.24647019979471987, "learning_rate": 1.3021135427038342e-05, "loss": 0.8051, "step": 2243 }, { "epoch": 0.617628844698273, "grad_norm": 0.26064932550995573, "learning_rate": 1.3004763652564608e-05, "loss": 0.7591, "step": 2244 }, { "epoch": 0.6179040803688158, "grad_norm": 0.2558613974831646, "learning_rate": 1.2988397217125579e-05, "loss": 0.8032, "step": 2245 }, { "epoch": 0.6181793160393587, "grad_norm": 0.25087162428622795, "learning_rate": 1.2972036133212747e-05, "loss": 0.7973, "step": 2246 }, { "epoch": 0.6184545517099016, "grad_norm": 0.24663915403638537, "learning_rate": 1.295568041331354e-05, "loss": 0.7727, "step": 2247 }, { "epoch": 0.6187297873804445, "grad_norm": 0.26800782727748, "learning_rate": 1.2939330069911262e-05, "loss": 0.7799, "step": 2248 }, { "epoch": 0.6190050230509874, "grad_norm": 0.2410928295502244, "learning_rate": 1.2922985115485137e-05, "loss": 0.7862, "step": 2249 }, { "epoch": 0.6192802587215304, "grad_norm": 0.24724726149674725, "learning_rate": 1.2906645562510261e-05, "loss": 0.7871, "step": 2250 }, { "epoch": 0.6195554943920732, "grad_norm": 0.24635501240077184, "learning_rate": 1.2890311423457611e-05, "loss": 0.7993, "step": 2251 }, { "epoch": 0.6198307300626161, "grad_norm": 0.33680368145251793, "learning_rate": 1.2873982710794028e-05, "loss": 0.7655, "step": 2252 }, { "epoch": 0.620105965733159, "grad_norm": 0.2495871575314044, "learning_rate": 1.2857659436982224e-05, "loss": 0.7843, "step": 2253 }, { "epoch": 0.6203812014037019, "grad_norm": 0.22796133306734526, "learning_rate": 1.2841341614480752e-05, "loss": 0.784, "step": 2254 }, { "epoch": 0.6206564370742448, "grad_norm": 0.26664818691016473, "learning_rate": 1.2825029255744007e-05, "loss": 0.7715, "step": 2255 }, { "epoch": 0.6209316727447878, "grad_norm": 0.24331570868163313, "learning_rate": 1.2808722373222207e-05, "loss": 0.7999, "step": 2256 }, { "epoch": 0.6212069084153307, "grad_norm": 0.25101474771491167, "learning_rate": 1.2792420979361397e-05, "loss": 0.7864, "step": 2257 }, { "epoch": 0.6214821440858735, "grad_norm": 0.24211368156900676, "learning_rate": 1.2776125086603423e-05, "loss": 0.7847, "step": 2258 }, { "epoch": 0.6217573797564164, "grad_norm": 0.2503028888344997, "learning_rate": 1.2759834707385955e-05, "loss": 0.8151, "step": 2259 }, { "epoch": 0.6220326154269593, "grad_norm": 0.24545898853250367, "learning_rate": 1.2743549854142423e-05, "loss": 0.7952, "step": 2260 }, { "epoch": 0.6223078510975022, "grad_norm": 0.2553532121263847, "learning_rate": 1.2727270539302073e-05, "loss": 0.797, "step": 2261 }, { "epoch": 0.6225830867680452, "grad_norm": 0.2536941497255776, "learning_rate": 1.2710996775289898e-05, "loss": 0.7687, "step": 2262 }, { "epoch": 0.6228583224385881, "grad_norm": 0.23925111841565022, "learning_rate": 1.2694728574526662e-05, "loss": 0.7737, "step": 2263 }, { "epoch": 0.623133558109131, "grad_norm": 0.504686627809076, "learning_rate": 1.2678465949428893e-05, "loss": 0.7847, "step": 2264 }, { "epoch": 0.6234087937796738, "grad_norm": 0.24621864824989545, "learning_rate": 1.2662208912408847e-05, "loss": 0.7871, "step": 2265 }, { "epoch": 0.6236840294502167, "grad_norm": 0.2556656062357497, "learning_rate": 1.2645957475874526e-05, "loss": 0.7911, "step": 2266 }, { "epoch": 0.6239592651207596, "grad_norm": 0.24282479045729188, "learning_rate": 1.2629711652229646e-05, "loss": 0.7365, "step": 2267 }, { "epoch": 0.6242345007913026, "grad_norm": 0.2567016389692855, "learning_rate": 1.2613471453873665e-05, "loss": 0.7627, "step": 2268 }, { "epoch": 0.6245097364618455, "grad_norm": 0.2394284978741426, "learning_rate": 1.2597236893201712e-05, "loss": 0.8056, "step": 2269 }, { "epoch": 0.6247849721323884, "grad_norm": 0.2493709173100802, "learning_rate": 1.2581007982604648e-05, "loss": 0.7816, "step": 2270 }, { "epoch": 0.6250602078029313, "grad_norm": 0.25440439313110574, "learning_rate": 1.256478473446899e-05, "loss": 0.7622, "step": 2271 }, { "epoch": 0.6253354434734741, "grad_norm": 0.2294143326463229, "learning_rate": 1.2548567161176958e-05, "loss": 0.7481, "step": 2272 }, { "epoch": 0.625610679144017, "grad_norm": 0.24189268285585241, "learning_rate": 1.2532355275106422e-05, "loss": 0.7502, "step": 2273 }, { "epoch": 0.62588591481456, "grad_norm": 0.25155503113058514, "learning_rate": 1.2516149088630925e-05, "loss": 0.7783, "step": 2274 }, { "epoch": 0.6261611504851029, "grad_norm": 0.2438130760526398, "learning_rate": 1.2499948614119653e-05, "loss": 0.7848, "step": 2275 }, { "epoch": 0.6264363861556458, "grad_norm": 0.261405497193663, "learning_rate": 1.248375386393744e-05, "loss": 0.7661, "step": 2276 }, { "epoch": 0.6267116218261887, "grad_norm": 0.22946324746237765, "learning_rate": 1.246756485044474e-05, "loss": 0.7643, "step": 2277 }, { "epoch": 0.6269868574967316, "grad_norm": 0.2407726296582577, "learning_rate": 1.2451381585997636e-05, "loss": 0.7802, "step": 2278 }, { "epoch": 0.6272620931672744, "grad_norm": 0.23077959991271488, "learning_rate": 1.2435204082947814e-05, "loss": 0.8265, "step": 2279 }, { "epoch": 0.6275373288378174, "grad_norm": 0.23362024948011076, "learning_rate": 1.2419032353642578e-05, "loss": 0.7813, "step": 2280 }, { "epoch": 0.6278125645083603, "grad_norm": 0.23991786287094416, "learning_rate": 1.2402866410424807e-05, "loss": 0.7725, "step": 2281 }, { "epoch": 0.6280878001789032, "grad_norm": 0.22973482496427158, "learning_rate": 1.2386706265632986e-05, "loss": 0.79, "step": 2282 }, { "epoch": 0.6283630358494461, "grad_norm": 0.22992772662224578, "learning_rate": 1.2370551931601158e-05, "loss": 0.7672, "step": 2283 }, { "epoch": 0.628638271519989, "grad_norm": 0.24191214980316453, "learning_rate": 1.2354403420658931e-05, "loss": 0.7727, "step": 2284 }, { "epoch": 0.6289135071905319, "grad_norm": 0.24087795410816093, "learning_rate": 1.2338260745131474e-05, "loss": 0.7923, "step": 2285 }, { "epoch": 0.6291887428610748, "grad_norm": 0.2457715965991265, "learning_rate": 1.2322123917339504e-05, "loss": 0.8129, "step": 2286 }, { "epoch": 0.6294639785316177, "grad_norm": 0.2259408108952426, "learning_rate": 1.2305992949599266e-05, "loss": 0.8071, "step": 2287 }, { "epoch": 0.6297392142021606, "grad_norm": 0.2456367249921771, "learning_rate": 1.2289867854222543e-05, "loss": 0.7624, "step": 2288 }, { "epoch": 0.6300144498727035, "grad_norm": 0.3770905799807695, "learning_rate": 1.2273748643516623e-05, "loss": 0.758, "step": 2289 }, { "epoch": 0.6302896855432464, "grad_norm": 0.39131468106479894, "learning_rate": 1.2257635329784323e-05, "loss": 0.7878, "step": 2290 }, { "epoch": 0.6305649212137893, "grad_norm": 0.24483191205626265, "learning_rate": 1.2241527925323935e-05, "loss": 0.756, "step": 2291 }, { "epoch": 0.6308401568843323, "grad_norm": 0.2573993912840869, "learning_rate": 1.2225426442429265e-05, "loss": 0.8081, "step": 2292 }, { "epoch": 0.6311153925548751, "grad_norm": 0.41148143613137705, "learning_rate": 1.2209330893389577e-05, "loss": 0.8122, "step": 2293 }, { "epoch": 0.631390628225418, "grad_norm": 0.23272816544232663, "learning_rate": 1.2193241290489616e-05, "loss": 0.7875, "step": 2294 }, { "epoch": 0.6316658638959609, "grad_norm": 0.24651607793492109, "learning_rate": 1.2177157646009593e-05, "loss": 0.7904, "step": 2295 }, { "epoch": 0.6319410995665038, "grad_norm": 0.24265080357975752, "learning_rate": 1.2161079972225163e-05, "loss": 0.7822, "step": 2296 }, { "epoch": 0.6322163352370467, "grad_norm": 0.2784699788521609, "learning_rate": 1.2145008281407428e-05, "loss": 0.761, "step": 2297 }, { "epoch": 0.6324915709075897, "grad_norm": 0.2315799898253475, "learning_rate": 1.2128942585822933e-05, "loss": 0.7773, "step": 2298 }, { "epoch": 0.6327668065781326, "grad_norm": 0.23419047998663908, "learning_rate": 1.2112882897733634e-05, "loss": 0.7701, "step": 2299 }, { "epoch": 0.6330420422486754, "grad_norm": 0.23076610103056508, "learning_rate": 1.2096829229396895e-05, "loss": 0.7805, "step": 2300 }, { "epoch": 0.6333172779192183, "grad_norm": 0.22687340264460226, "learning_rate": 1.2080781593065503e-05, "loss": 0.7664, "step": 2301 }, { "epoch": 0.6335925135897612, "grad_norm": 0.22535323307122473, "learning_rate": 1.2064740000987638e-05, "loss": 0.7795, "step": 2302 }, { "epoch": 0.6338677492603041, "grad_norm": 0.2526634922138882, "learning_rate": 1.2048704465406854e-05, "loss": 0.7806, "step": 2303 }, { "epoch": 0.6341429849308471, "grad_norm": 0.2356073577776414, "learning_rate": 1.2032674998562101e-05, "loss": 0.7967, "step": 2304 }, { "epoch": 0.63441822060139, "grad_norm": 0.23507497560140406, "learning_rate": 1.2016651612687685e-05, "loss": 0.7769, "step": 2305 }, { "epoch": 0.6346934562719329, "grad_norm": 0.23090413448333263, "learning_rate": 1.2000634320013274e-05, "loss": 0.769, "step": 2306 }, { "epoch": 0.6349686919424757, "grad_norm": 0.28138726478972076, "learning_rate": 1.1984623132763873e-05, "loss": 0.7978, "step": 2307 }, { "epoch": 0.6352439276130186, "grad_norm": 0.24092751614783542, "learning_rate": 1.1968618063159859e-05, "loss": 0.7643, "step": 2308 }, { "epoch": 0.6355191632835615, "grad_norm": 0.2249205336894784, "learning_rate": 1.1952619123416903e-05, "loss": 0.7719, "step": 2309 }, { "epoch": 0.6357943989541045, "grad_norm": 0.2361541579458213, "learning_rate": 1.1936626325746015e-05, "loss": 0.7749, "step": 2310 }, { "epoch": 0.6360696346246474, "grad_norm": 0.2482843281120338, "learning_rate": 1.1920639682353529e-05, "loss": 0.7908, "step": 2311 }, { "epoch": 0.6363448702951903, "grad_norm": 0.24824941919000415, "learning_rate": 1.1904659205441061e-05, "loss": 0.8059, "step": 2312 }, { "epoch": 0.6366201059657332, "grad_norm": 0.23422129086053145, "learning_rate": 1.1888684907205527e-05, "loss": 0.7716, "step": 2313 }, { "epoch": 0.636895341636276, "grad_norm": 0.25211344670993874, "learning_rate": 1.1872716799839132e-05, "loss": 0.7719, "step": 2314 }, { "epoch": 0.6371705773068189, "grad_norm": 0.23532948350451352, "learning_rate": 1.1856754895529355e-05, "loss": 0.7822, "step": 2315 }, { "epoch": 0.6374458129773619, "grad_norm": 0.2406375927427003, "learning_rate": 1.1840799206458927e-05, "loss": 0.7701, "step": 2316 }, { "epoch": 0.6377210486479048, "grad_norm": 0.23401301404751848, "learning_rate": 1.1824849744805855e-05, "loss": 0.7846, "step": 2317 }, { "epoch": 0.6379962843184477, "grad_norm": 0.24204008195510776, "learning_rate": 1.1808906522743384e-05, "loss": 0.7773, "step": 2318 }, { "epoch": 0.6382715199889906, "grad_norm": 0.24039894750688456, "learning_rate": 1.1792969552439998e-05, "loss": 0.7635, "step": 2319 }, { "epoch": 0.6385467556595334, "grad_norm": 0.2362156141922952, "learning_rate": 1.1777038846059411e-05, "loss": 0.7736, "step": 2320 }, { "epoch": 0.6388219913300763, "grad_norm": 0.24775261630507306, "learning_rate": 1.176111441576055e-05, "loss": 0.7862, "step": 2321 }, { "epoch": 0.6390972270006193, "grad_norm": 0.23323874597649452, "learning_rate": 1.174519627369755e-05, "loss": 0.7715, "step": 2322 }, { "epoch": 0.6393724626711622, "grad_norm": 0.225345977039023, "learning_rate": 1.172928443201976e-05, "loss": 0.7648, "step": 2323 }, { "epoch": 0.6396476983417051, "grad_norm": 0.2395747303079639, "learning_rate": 1.1713378902871706e-05, "loss": 0.7797, "step": 2324 }, { "epoch": 0.639922934012248, "grad_norm": 0.24074112221048438, "learning_rate": 1.1697479698393112e-05, "loss": 0.7755, "step": 2325 }, { "epoch": 0.6401981696827909, "grad_norm": 0.23765364708162398, "learning_rate": 1.1681586830718862e-05, "loss": 0.7727, "step": 2326 }, { "epoch": 0.6404734053533337, "grad_norm": 0.24476468611717303, "learning_rate": 1.1665700311979e-05, "loss": 0.8085, "step": 2327 }, { "epoch": 0.6407486410238767, "grad_norm": 0.24529029491177157, "learning_rate": 1.1649820154298743e-05, "loss": 0.802, "step": 2328 }, { "epoch": 0.6410238766944196, "grad_norm": 0.24333979288255772, "learning_rate": 1.1633946369798426e-05, "loss": 0.7633, "step": 2329 }, { "epoch": 0.6412991123649625, "grad_norm": 0.2311984249345681, "learning_rate": 1.1618078970593544e-05, "loss": 0.7631, "step": 2330 }, { "epoch": 0.6415743480355054, "grad_norm": 0.2329726701246018, "learning_rate": 1.160221796879471e-05, "loss": 0.8027, "step": 2331 }, { "epoch": 0.6418495837060483, "grad_norm": 0.3710747252261914, "learning_rate": 1.1586363376507648e-05, "loss": 0.8146, "step": 2332 }, { "epoch": 0.6421248193765912, "grad_norm": 0.2454256887968694, "learning_rate": 1.1570515205833206e-05, "loss": 0.7871, "step": 2333 }, { "epoch": 0.6424000550471342, "grad_norm": 0.2277914598754184, "learning_rate": 1.1554673468867308e-05, "loss": 0.8097, "step": 2334 }, { "epoch": 0.642675290717677, "grad_norm": 0.23925652711046633, "learning_rate": 1.1538838177700993e-05, "loss": 0.8003, "step": 2335 }, { "epoch": 0.6429505263882199, "grad_norm": 0.25000951173695574, "learning_rate": 1.1523009344420348e-05, "loss": 0.771, "step": 2336 }, { "epoch": 0.6432257620587628, "grad_norm": 0.24575782003591679, "learning_rate": 1.1507186981106564e-05, "loss": 0.7749, "step": 2337 }, { "epoch": 0.6435009977293057, "grad_norm": 0.24094055600625075, "learning_rate": 1.1491371099835886e-05, "loss": 0.7525, "step": 2338 }, { "epoch": 0.6437762333998486, "grad_norm": 0.23616841299124244, "learning_rate": 1.1475561712679582e-05, "loss": 0.7947, "step": 2339 }, { "epoch": 0.6440514690703916, "grad_norm": 0.22506594021173423, "learning_rate": 1.1459758831704018e-05, "loss": 0.7787, "step": 2340 }, { "epoch": 0.6443267047409345, "grad_norm": 0.24818684843520203, "learning_rate": 1.144396246897054e-05, "loss": 0.7648, "step": 2341 }, { "epoch": 0.6446019404114773, "grad_norm": 0.22244800818479413, "learning_rate": 1.1428172636535551e-05, "loss": 0.7663, "step": 2342 }, { "epoch": 0.6448771760820202, "grad_norm": 0.24281419515466754, "learning_rate": 1.1412389346450468e-05, "loss": 0.7654, "step": 2343 }, { "epoch": 0.6451524117525631, "grad_norm": 0.22878715467206565, "learning_rate": 1.1396612610761695e-05, "loss": 0.7773, "step": 2344 }, { "epoch": 0.645427647423106, "grad_norm": 0.24416688028267866, "learning_rate": 1.1380842441510658e-05, "loss": 0.7923, "step": 2345 }, { "epoch": 0.645702883093649, "grad_norm": 0.2303559071426893, "learning_rate": 1.1365078850733738e-05, "loss": 0.7865, "step": 2346 }, { "epoch": 0.6459781187641919, "grad_norm": 0.23975013965658076, "learning_rate": 1.1349321850462342e-05, "loss": 0.8106, "step": 2347 }, { "epoch": 0.6462533544347348, "grad_norm": 0.23104271924792577, "learning_rate": 1.133357145272282e-05, "loss": 0.7852, "step": 2348 }, { "epoch": 0.6465285901052776, "grad_norm": 0.23086857881356623, "learning_rate": 1.1317827669536467e-05, "loss": 0.7859, "step": 2349 }, { "epoch": 0.6468038257758205, "grad_norm": 0.2872589332126486, "learning_rate": 1.1302090512919564e-05, "loss": 0.7876, "step": 2350 }, { "epoch": 0.6470790614463634, "grad_norm": 0.2306238044182807, "learning_rate": 1.1286359994883302e-05, "loss": 0.7667, "step": 2351 }, { "epoch": 0.6473542971169064, "grad_norm": 0.32793517018662044, "learning_rate": 1.1270636127433827e-05, "loss": 0.784, "step": 2352 }, { "epoch": 0.6476295327874493, "grad_norm": 0.23372195785710242, "learning_rate": 1.1254918922572205e-05, "loss": 0.7831, "step": 2353 }, { "epoch": 0.6479047684579922, "grad_norm": 0.2354023056923587, "learning_rate": 1.1239208392294406e-05, "loss": 0.7985, "step": 2354 }, { "epoch": 0.648180004128535, "grad_norm": 0.2689494954821914, "learning_rate": 1.122350454859133e-05, "loss": 0.7995, "step": 2355 }, { "epoch": 0.6484552397990779, "grad_norm": 0.22864863268338262, "learning_rate": 1.1207807403448742e-05, "loss": 0.7862, "step": 2356 }, { "epoch": 0.6487304754696208, "grad_norm": 0.22877383469670426, "learning_rate": 1.1192116968847313e-05, "loss": 0.7657, "step": 2357 }, { "epoch": 0.6490057111401638, "grad_norm": 0.24724173580536435, "learning_rate": 1.11764332567626e-05, "loss": 0.8074, "step": 2358 }, { "epoch": 0.6492809468107067, "grad_norm": 0.22760314279807323, "learning_rate": 1.1160756279164996e-05, "loss": 0.7546, "step": 2359 }, { "epoch": 0.6495561824812496, "grad_norm": 0.24002453943654778, "learning_rate": 1.1145086048019795e-05, "loss": 0.7826, "step": 2360 }, { "epoch": 0.6498314181517925, "grad_norm": 0.2140653012670485, "learning_rate": 1.1129422575287116e-05, "loss": 0.7602, "step": 2361 }, { "epoch": 0.6501066538223353, "grad_norm": 0.23523993899709766, "learning_rate": 1.1113765872921933e-05, "loss": 0.746, "step": 2362 }, { "epoch": 0.6503818894928782, "grad_norm": 0.23171924831741408, "learning_rate": 1.1098115952874036e-05, "loss": 0.7613, "step": 2363 }, { "epoch": 0.6506571251634212, "grad_norm": 0.2534435156716113, "learning_rate": 1.1082472827088053e-05, "loss": 0.8077, "step": 2364 }, { "epoch": 0.6509323608339641, "grad_norm": 0.23618127991487797, "learning_rate": 1.1066836507503428e-05, "loss": 0.7812, "step": 2365 }, { "epoch": 0.651207596504507, "grad_norm": 0.24245911206849247, "learning_rate": 1.1051207006054394e-05, "loss": 0.7854, "step": 2366 }, { "epoch": 0.6514828321750499, "grad_norm": 0.22079706440775906, "learning_rate": 1.1035584334669998e-05, "loss": 0.7984, "step": 2367 }, { "epoch": 0.6517580678455928, "grad_norm": 0.24154293030207943, "learning_rate": 1.101996850527406e-05, "loss": 0.7635, "step": 2368 }, { "epoch": 0.6520333035161356, "grad_norm": 0.21961355942826122, "learning_rate": 1.1004359529785194e-05, "loss": 0.7791, "step": 2369 }, { "epoch": 0.6523085391866786, "grad_norm": 0.2394960830275329, "learning_rate": 1.0988757420116771e-05, "loss": 0.7948, "step": 2370 }, { "epoch": 0.6525837748572215, "grad_norm": 0.22316095598405364, "learning_rate": 1.0973162188176915e-05, "loss": 0.7866, "step": 2371 }, { "epoch": 0.6528590105277644, "grad_norm": 0.23774132487002664, "learning_rate": 1.0957573845868525e-05, "loss": 0.7915, "step": 2372 }, { "epoch": 0.6531342461983073, "grad_norm": 0.24861044095870596, "learning_rate": 1.0941992405089209e-05, "loss": 0.8048, "step": 2373 }, { "epoch": 0.6534094818688502, "grad_norm": 0.22040595195310883, "learning_rate": 1.092641787773133e-05, "loss": 0.7828, "step": 2374 }, { "epoch": 0.6536847175393931, "grad_norm": 0.24109090382083664, "learning_rate": 1.0910850275681974e-05, "loss": 0.7785, "step": 2375 }, { "epoch": 0.653959953209936, "grad_norm": 0.23803836777273013, "learning_rate": 1.0895289610822935e-05, "loss": 0.7592, "step": 2376 }, { "epoch": 0.6542351888804789, "grad_norm": 0.23187521600298203, "learning_rate": 1.087973589503072e-05, "loss": 0.7836, "step": 2377 }, { "epoch": 0.6545104245510218, "grad_norm": 0.23309562053529873, "learning_rate": 1.0864189140176512e-05, "loss": 0.7766, "step": 2378 }, { "epoch": 0.6547856602215647, "grad_norm": 0.22371015381882509, "learning_rate": 1.0848649358126205e-05, "loss": 0.7896, "step": 2379 }, { "epoch": 0.6550608958921076, "grad_norm": 0.23349125197890194, "learning_rate": 1.0833116560740361e-05, "loss": 0.7665, "step": 2380 }, { "epoch": 0.6553361315626505, "grad_norm": 0.22837551334809192, "learning_rate": 1.0817590759874194e-05, "loss": 0.7783, "step": 2381 }, { "epoch": 0.6556113672331935, "grad_norm": 0.23338696297514103, "learning_rate": 1.080207196737763e-05, "loss": 0.7719, "step": 2382 }, { "epoch": 0.6558866029037363, "grad_norm": 0.23425031111914862, "learning_rate": 1.0786560195095181e-05, "loss": 0.7842, "step": 2383 }, { "epoch": 0.6561618385742792, "grad_norm": 0.21035023614697532, "learning_rate": 1.0771055454866048e-05, "loss": 0.7708, "step": 2384 }, { "epoch": 0.6564370742448221, "grad_norm": 0.23647780094331225, "learning_rate": 1.0755557758524033e-05, "loss": 0.7643, "step": 2385 }, { "epoch": 0.656712309915365, "grad_norm": 0.22346276087431097, "learning_rate": 1.0740067117897586e-05, "loss": 0.7624, "step": 2386 }, { "epoch": 0.6569875455859079, "grad_norm": 0.23977669174938704, "learning_rate": 1.0724583544809768e-05, "loss": 0.799, "step": 2387 }, { "epoch": 0.6572627812564509, "grad_norm": 0.22182448648270314, "learning_rate": 1.0709107051078221e-05, "loss": 0.7723, "step": 2388 }, { "epoch": 0.6575380169269938, "grad_norm": 0.2194907430601821, "learning_rate": 1.0693637648515228e-05, "loss": 0.7838, "step": 2389 }, { "epoch": 0.6578132525975366, "grad_norm": 0.229976596291359, "learning_rate": 1.0678175348927615e-05, "loss": 0.7704, "step": 2390 }, { "epoch": 0.6580884882680795, "grad_norm": 0.2211099114616119, "learning_rate": 1.0662720164116815e-05, "loss": 0.7609, "step": 2391 }, { "epoch": 0.6583637239386224, "grad_norm": 0.2180251462179224, "learning_rate": 1.0647272105878833e-05, "loss": 0.7689, "step": 2392 }, { "epoch": 0.6586389596091653, "grad_norm": 0.2203189616623406, "learning_rate": 1.06318311860042e-05, "loss": 0.7471, "step": 2393 }, { "epoch": 0.6589141952797083, "grad_norm": 0.22680169811459414, "learning_rate": 1.0616397416278046e-05, "loss": 0.777, "step": 2394 }, { "epoch": 0.6591894309502512, "grad_norm": 0.22476041046566922, "learning_rate": 1.0600970808479997e-05, "loss": 0.7878, "step": 2395 }, { "epoch": 0.6594646666207941, "grad_norm": 0.2453410077918203, "learning_rate": 1.0585551374384246e-05, "loss": 0.7492, "step": 2396 }, { "epoch": 0.6597399022913369, "grad_norm": 0.22364329816030826, "learning_rate": 1.0570139125759518e-05, "loss": 0.7596, "step": 2397 }, { "epoch": 0.6600151379618798, "grad_norm": 0.2437942537641047, "learning_rate": 1.0554734074369017e-05, "loss": 0.7816, "step": 2398 }, { "epoch": 0.6602903736324227, "grad_norm": 0.2217240731203041, "learning_rate": 1.0539336231970485e-05, "loss": 0.7559, "step": 2399 }, { "epoch": 0.6605656093029657, "grad_norm": 0.23474539493899937, "learning_rate": 1.0523945610316138e-05, "loss": 0.7722, "step": 2400 }, { "epoch": 0.6608408449735086, "grad_norm": 0.23905008315022286, "learning_rate": 1.0508562221152699e-05, "loss": 0.7981, "step": 2401 }, { "epoch": 0.6611160806440515, "grad_norm": 0.2179803554679985, "learning_rate": 1.0493186076221376e-05, "loss": 0.7887, "step": 2402 }, { "epoch": 0.6613913163145944, "grad_norm": 0.30493275613170756, "learning_rate": 1.0477817187257809e-05, "loss": 0.7689, "step": 2403 }, { "epoch": 0.6616665519851372, "grad_norm": 0.21878972190937115, "learning_rate": 1.0462455565992161e-05, "loss": 0.778, "step": 2404 }, { "epoch": 0.6619417876556801, "grad_norm": 0.21404023319142085, "learning_rate": 1.0447101224148994e-05, "loss": 0.7717, "step": 2405 }, { "epoch": 0.6622170233262231, "grad_norm": 0.22796253040553002, "learning_rate": 1.043175417344734e-05, "loss": 0.7785, "step": 2406 }, { "epoch": 0.662492258996766, "grad_norm": 0.2276333695329629, "learning_rate": 1.041641442560067e-05, "loss": 0.7638, "step": 2407 }, { "epoch": 0.6627674946673089, "grad_norm": 0.21183330904074812, "learning_rate": 1.0401081992316857e-05, "loss": 0.7583, "step": 2408 }, { "epoch": 0.6630427303378518, "grad_norm": 0.2492603438225398, "learning_rate": 1.038575688529822e-05, "loss": 0.7733, "step": 2409 }, { "epoch": 0.6633179660083947, "grad_norm": 0.2280886545169572, "learning_rate": 1.0370439116241455e-05, "loss": 0.8024, "step": 2410 }, { "epoch": 0.6635932016789375, "grad_norm": 0.46669864005277895, "learning_rate": 1.0355128696837702e-05, "loss": 0.7827, "step": 2411 }, { "epoch": 0.6638684373494805, "grad_norm": 0.2364701944158192, "learning_rate": 1.033982563877244e-05, "loss": 0.7802, "step": 2412 }, { "epoch": 0.6641436730200234, "grad_norm": 0.2371027365459466, "learning_rate": 1.0324529953725568e-05, "loss": 0.8017, "step": 2413 }, { "epoch": 0.6644189086905663, "grad_norm": 0.2358545000830875, "learning_rate": 1.0309241653371347e-05, "loss": 0.7668, "step": 2414 }, { "epoch": 0.6646941443611092, "grad_norm": 0.22808108263937973, "learning_rate": 1.0293960749378384e-05, "loss": 0.7726, "step": 2415 }, { "epoch": 0.6649693800316521, "grad_norm": 0.22541924299814917, "learning_rate": 1.0278687253409662e-05, "loss": 0.7537, "step": 2416 }, { "epoch": 0.665244615702195, "grad_norm": 0.2477169909305548, "learning_rate": 1.0263421177122505e-05, "loss": 0.7952, "step": 2417 }, { "epoch": 0.665519851372738, "grad_norm": 0.23406721745364348, "learning_rate": 1.0248162532168574e-05, "loss": 0.799, "step": 2418 }, { "epoch": 0.6657950870432808, "grad_norm": 0.22671711008442896, "learning_rate": 1.0232911330193861e-05, "loss": 0.7721, "step": 2419 }, { "epoch": 0.6660703227138237, "grad_norm": 0.23392243064526896, "learning_rate": 1.021766758283866e-05, "loss": 0.7963, "step": 2420 }, { "epoch": 0.6663455583843666, "grad_norm": 0.22942307844003806, "learning_rate": 1.02024313017376e-05, "loss": 0.7507, "step": 2421 }, { "epoch": 0.6666207940549095, "grad_norm": 0.21580367183403693, "learning_rate": 1.0187202498519588e-05, "loss": 0.7794, "step": 2422 }, { "epoch": 0.6668960297254524, "grad_norm": 0.24088443619876312, "learning_rate": 1.017198118480784e-05, "loss": 0.7978, "step": 2423 }, { "epoch": 0.6671712653959954, "grad_norm": 0.21582027508119267, "learning_rate": 1.0156767372219854e-05, "loss": 0.7913, "step": 2424 }, { "epoch": 0.6674465010665382, "grad_norm": 0.2397917395336763, "learning_rate": 1.0141561072367396e-05, "loss": 0.7794, "step": 2425 }, { "epoch": 0.6677217367370811, "grad_norm": 0.23306780283956408, "learning_rate": 1.0126362296856511e-05, "loss": 0.7555, "step": 2426 }, { "epoch": 0.667996972407624, "grad_norm": 0.22743305802924532, "learning_rate": 1.0111171057287477e-05, "loss": 0.7534, "step": 2427 }, { "epoch": 0.6682722080781669, "grad_norm": 0.22907358964366473, "learning_rate": 1.0095987365254843e-05, "loss": 0.766, "step": 2428 }, { "epoch": 0.6685474437487098, "grad_norm": 0.22599697983218686, "learning_rate": 1.0080811232347396e-05, "loss": 0.7926, "step": 2429 }, { "epoch": 0.6688226794192528, "grad_norm": 0.2288149061038424, "learning_rate": 1.006564267014813e-05, "loss": 0.7393, "step": 2430 }, { "epoch": 0.6690979150897957, "grad_norm": 0.2316541769565441, "learning_rate": 1.005048169023429e-05, "loss": 0.7778, "step": 2431 }, { "epoch": 0.6693731507603385, "grad_norm": 0.23462334035713, "learning_rate": 1.003532830417732e-05, "loss": 0.7878, "step": 2432 }, { "epoch": 0.6696483864308814, "grad_norm": 0.22281533873076556, "learning_rate": 1.0020182523542869e-05, "loss": 0.7815, "step": 2433 }, { "epoch": 0.6699236221014243, "grad_norm": 0.2309986177915826, "learning_rate": 1.000504435989079e-05, "loss": 0.7658, "step": 2434 }, { "epoch": 0.6701988577719672, "grad_norm": 0.22486743102674223, "learning_rate": 9.9899138247751e-06, "loss": 0.7823, "step": 2435 }, { "epoch": 0.6704740934425102, "grad_norm": 0.23502472525632212, "learning_rate": 9.974790929744021e-06, "loss": 0.7657, "step": 2436 }, { "epoch": 0.6707493291130531, "grad_norm": 0.24519665537909452, "learning_rate": 9.959675686339918e-06, "loss": 0.7782, "step": 2437 }, { "epoch": 0.671024564783596, "grad_norm": 0.22979213514478425, "learning_rate": 9.944568106099336e-06, "loss": 0.7671, "step": 2438 }, { "epoch": 0.6712998004541388, "grad_norm": 0.25216614197100384, "learning_rate": 9.929468200552963e-06, "loss": 0.789, "step": 2439 }, { "epoch": 0.6715750361246817, "grad_norm": 0.23075737764323775, "learning_rate": 9.914375981225632e-06, "loss": 0.7888, "step": 2440 }, { "epoch": 0.6718502717952246, "grad_norm": 0.22458144886045955, "learning_rate": 9.899291459636316e-06, "loss": 0.7749, "step": 2441 }, { "epoch": 0.6721255074657676, "grad_norm": 0.23322259378345364, "learning_rate": 9.884214647298087e-06, "loss": 0.7985, "step": 2442 }, { "epoch": 0.6724007431363105, "grad_norm": 0.23492890404689232, "learning_rate": 9.869145555718162e-06, "loss": 0.7948, "step": 2443 }, { "epoch": 0.6726759788068534, "grad_norm": 0.22469135354354047, "learning_rate": 9.854084196397859e-06, "loss": 0.7704, "step": 2444 }, { "epoch": 0.6729512144773963, "grad_norm": 0.2220893929783055, "learning_rate": 9.839030580832573e-06, "loss": 0.776, "step": 2445 }, { "epoch": 0.6732264501479391, "grad_norm": 0.23533516873712618, "learning_rate": 9.823984720511816e-06, "loss": 0.7762, "step": 2446 }, { "epoch": 0.673501685818482, "grad_norm": 0.2206522024450872, "learning_rate": 9.808946626919172e-06, "loss": 0.8001, "step": 2447 }, { "epoch": 0.673776921489025, "grad_norm": 0.2253775746862108, "learning_rate": 9.793916311532294e-06, "loss": 0.8135, "step": 2448 }, { "epoch": 0.6740521571595679, "grad_norm": 0.2269850088529325, "learning_rate": 9.778893785822894e-06, "loss": 0.8209, "step": 2449 }, { "epoch": 0.6743273928301108, "grad_norm": 0.3225439821012345, "learning_rate": 9.763879061256744e-06, "loss": 0.7663, "step": 2450 }, { "epoch": 0.6746026285006537, "grad_norm": 0.21724346561672142, "learning_rate": 9.748872149293678e-06, "loss": 0.7899, "step": 2451 }, { "epoch": 0.6748778641711966, "grad_norm": 0.2376174096802007, "learning_rate": 9.733873061387527e-06, "loss": 0.7699, "step": 2452 }, { "epoch": 0.6751530998417394, "grad_norm": 0.21132385126114916, "learning_rate": 9.718881808986186e-06, "loss": 0.7823, "step": 2453 }, { "epoch": 0.6754283355122824, "grad_norm": 0.21257845733422154, "learning_rate": 9.703898403531561e-06, "loss": 0.7415, "step": 2454 }, { "epoch": 0.6757035711828253, "grad_norm": 0.21390548304356413, "learning_rate": 9.688922856459563e-06, "loss": 0.7637, "step": 2455 }, { "epoch": 0.6759788068533682, "grad_norm": 0.21537758501429546, "learning_rate": 9.673955179200116e-06, "loss": 0.7669, "step": 2456 }, { "epoch": 0.6762540425239111, "grad_norm": 0.22287252512032207, "learning_rate": 9.658995383177114e-06, "loss": 0.7623, "step": 2457 }, { "epoch": 0.676529278194454, "grad_norm": 0.3495194670430522, "learning_rate": 9.64404347980847e-06, "loss": 0.7977, "step": 2458 }, { "epoch": 0.6768045138649968, "grad_norm": 0.21585522136163524, "learning_rate": 9.629099480506034e-06, "loss": 0.7675, "step": 2459 }, { "epoch": 0.6770797495355398, "grad_norm": 0.22049895208382783, "learning_rate": 9.614163396675657e-06, "loss": 0.7688, "step": 2460 }, { "epoch": 0.6773549852060827, "grad_norm": 0.22841228167081598, "learning_rate": 9.599235239717131e-06, "loss": 0.7805, "step": 2461 }, { "epoch": 0.6776302208766256, "grad_norm": 0.20867410171905978, "learning_rate": 9.584315021024205e-06, "loss": 0.766, "step": 2462 }, { "epoch": 0.6779054565471685, "grad_norm": 0.21957400992775994, "learning_rate": 9.56940275198457e-06, "loss": 0.7574, "step": 2463 }, { "epoch": 0.6781806922177114, "grad_norm": 0.22692495941500024, "learning_rate": 9.554498443979837e-06, "loss": 0.7628, "step": 2464 }, { "epoch": 0.6784559278882544, "grad_norm": 0.21468268215342132, "learning_rate": 9.539602108385551e-06, "loss": 0.7595, "step": 2465 }, { "epoch": 0.6787311635587973, "grad_norm": 0.22226371709403886, "learning_rate": 9.524713756571185e-06, "loss": 0.7792, "step": 2466 }, { "epoch": 0.6790063992293401, "grad_norm": 0.2231191835609718, "learning_rate": 9.509833399900076e-06, "loss": 0.789, "step": 2467 }, { "epoch": 0.679281634899883, "grad_norm": 0.2230108118422868, "learning_rate": 9.494961049729521e-06, "loss": 0.7615, "step": 2468 }, { "epoch": 0.6795568705704259, "grad_norm": 0.2189954800987077, "learning_rate": 9.480096717410647e-06, "loss": 0.7934, "step": 2469 }, { "epoch": 0.6798321062409688, "grad_norm": 0.21776374077364447, "learning_rate": 9.465240414288505e-06, "loss": 0.7803, "step": 2470 }, { "epoch": 0.6801073419115118, "grad_norm": 0.21921862823179844, "learning_rate": 9.450392151701983e-06, "loss": 0.7754, "step": 2471 }, { "epoch": 0.6803825775820547, "grad_norm": 0.2241906980218283, "learning_rate": 9.435551940983859e-06, "loss": 0.7765, "step": 2472 }, { "epoch": 0.6806578132525976, "grad_norm": 0.2253310478368545, "learning_rate": 9.420719793460758e-06, "loss": 0.795, "step": 2473 }, { "epoch": 0.6809330489231404, "grad_norm": 0.2173925233300184, "learning_rate": 9.405895720453128e-06, "loss": 0.7785, "step": 2474 }, { "epoch": 0.6812082845936833, "grad_norm": 0.30708324943746157, "learning_rate": 9.391079733275306e-06, "loss": 0.775, "step": 2475 }, { "epoch": 0.6814835202642262, "grad_norm": 0.22534096606040097, "learning_rate": 9.3762718432354e-06, "loss": 0.8064, "step": 2476 }, { "epoch": 0.6817587559347692, "grad_norm": 0.2195259041646853, "learning_rate": 9.361472061635374e-06, "loss": 0.7918, "step": 2477 }, { "epoch": 0.6820339916053121, "grad_norm": 0.2100772620504874, "learning_rate": 9.346680399771003e-06, "loss": 0.7758, "step": 2478 }, { "epoch": 0.682309227275855, "grad_norm": 0.2116302368744064, "learning_rate": 9.331896868931834e-06, "loss": 0.7545, "step": 2479 }, { "epoch": 0.6825844629463979, "grad_norm": 0.221247983582461, "learning_rate": 9.317121480401245e-06, "loss": 0.7725, "step": 2480 }, { "epoch": 0.6828596986169407, "grad_norm": 0.21351777609821598, "learning_rate": 9.302354245456367e-06, "loss": 0.772, "step": 2481 }, { "epoch": 0.6831349342874836, "grad_norm": 0.22766266731837248, "learning_rate": 9.287595175368143e-06, "loss": 0.7588, "step": 2482 }, { "epoch": 0.6834101699580266, "grad_norm": 0.22936644820810378, "learning_rate": 9.272844281401263e-06, "loss": 0.7675, "step": 2483 }, { "epoch": 0.6836854056285695, "grad_norm": 0.3757291981749503, "learning_rate": 9.25810157481417e-06, "loss": 0.7857, "step": 2484 }, { "epoch": 0.6839606412991124, "grad_norm": 0.23193772438613108, "learning_rate": 9.243367066859077e-06, "loss": 0.7793, "step": 2485 }, { "epoch": 0.6842358769696553, "grad_norm": 0.24922646157771597, "learning_rate": 9.228640768781919e-06, "loss": 0.7559, "step": 2486 }, { "epoch": 0.6845111126401981, "grad_norm": 0.22382613600955226, "learning_rate": 9.21392269182238e-06, "loss": 0.7648, "step": 2487 }, { "epoch": 0.684786348310741, "grad_norm": 0.2386437090117118, "learning_rate": 9.199212847213866e-06, "loss": 0.7733, "step": 2488 }, { "epoch": 0.685061583981284, "grad_norm": 0.22702347702387204, "learning_rate": 9.1845112461835e-06, "loss": 0.7695, "step": 2489 }, { "epoch": 0.6853368196518269, "grad_norm": 0.2560279503686138, "learning_rate": 9.16981789995212e-06, "loss": 0.802, "step": 2490 }, { "epoch": 0.6856120553223698, "grad_norm": 0.22291033943009855, "learning_rate": 9.15513281973424e-06, "loss": 0.785, "step": 2491 }, { "epoch": 0.6858872909929127, "grad_norm": 0.218627646927319, "learning_rate": 9.140456016738086e-06, "loss": 0.7469, "step": 2492 }, { "epoch": 0.6861625266634556, "grad_norm": 0.23721128902782118, "learning_rate": 9.125787502165573e-06, "loss": 0.7786, "step": 2493 }, { "epoch": 0.6864377623339984, "grad_norm": 0.2233248862456069, "learning_rate": 9.11112728721226e-06, "loss": 0.7737, "step": 2494 }, { "epoch": 0.6867129980045414, "grad_norm": 0.21617977016839804, "learning_rate": 9.096475383067398e-06, "loss": 0.7729, "step": 2495 }, { "epoch": 0.6869882336750843, "grad_norm": 0.2324198017017012, "learning_rate": 9.081831800913885e-06, "loss": 0.8005, "step": 2496 }, { "epoch": 0.6872634693456272, "grad_norm": 0.4091011695050855, "learning_rate": 9.067196551928279e-06, "loss": 0.8117, "step": 2497 }, { "epoch": 0.6875387050161701, "grad_norm": 0.22409578706629474, "learning_rate": 9.05256964728075e-06, "loss": 0.7565, "step": 2498 }, { "epoch": 0.687813940686713, "grad_norm": 0.21561504830453673, "learning_rate": 9.03795109813513e-06, "loss": 0.784, "step": 2499 }, { "epoch": 0.6880891763572559, "grad_norm": 0.3939017904240304, "learning_rate": 9.02334091564886e-06, "loss": 0.8239, "step": 2500 }, { "epoch": 0.6883644120277989, "grad_norm": 0.21486174978119335, "learning_rate": 9.008739110972986e-06, "loss": 0.7842, "step": 2501 }, { "epoch": 0.6886396476983417, "grad_norm": 0.21701146730722043, "learning_rate": 8.994145695252174e-06, "loss": 0.7635, "step": 2502 }, { "epoch": 0.6889148833688846, "grad_norm": 0.3449154128415657, "learning_rate": 8.979560679624687e-06, "loss": 0.7787, "step": 2503 }, { "epoch": 0.6891901190394275, "grad_norm": 0.22169093987856153, "learning_rate": 8.964984075222368e-06, "loss": 0.7618, "step": 2504 }, { "epoch": 0.6894653547099704, "grad_norm": 0.2206818582087166, "learning_rate": 8.950415893170657e-06, "loss": 0.7735, "step": 2505 }, { "epoch": 0.6897405903805133, "grad_norm": 0.2345879175576268, "learning_rate": 8.935856144588532e-06, "loss": 0.7689, "step": 2506 }, { "epoch": 0.6900158260510563, "grad_norm": 0.21893015317455772, "learning_rate": 8.921304840588578e-06, "loss": 0.7737, "step": 2507 }, { "epoch": 0.6902910617215992, "grad_norm": 0.22027550928832026, "learning_rate": 8.906761992276893e-06, "loss": 0.7777, "step": 2508 }, { "epoch": 0.690566297392142, "grad_norm": 0.2435470123997399, "learning_rate": 8.89222761075315e-06, "loss": 0.7964, "step": 2509 }, { "epoch": 0.6908415330626849, "grad_norm": 0.2196593302073358, "learning_rate": 8.87770170711055e-06, "loss": 0.75, "step": 2510 }, { "epoch": 0.6911167687332278, "grad_norm": 0.21792435152172385, "learning_rate": 8.863184292435828e-06, "loss": 0.7402, "step": 2511 }, { "epoch": 0.6913920044037707, "grad_norm": 0.2235878784830696, "learning_rate": 8.848675377809235e-06, "loss": 0.7886, "step": 2512 }, { "epoch": 0.6916672400743137, "grad_norm": 0.2302819263673517, "learning_rate": 8.834174974304526e-06, "loss": 0.7951, "step": 2513 }, { "epoch": 0.6919424757448566, "grad_norm": 0.22502686844395442, "learning_rate": 8.819683092988978e-06, "loss": 0.7842, "step": 2514 }, { "epoch": 0.6922177114153995, "grad_norm": 0.22555359566475297, "learning_rate": 8.805199744923356e-06, "loss": 0.7856, "step": 2515 }, { "epoch": 0.6924929470859423, "grad_norm": 0.21195604623711484, "learning_rate": 8.790724941161904e-06, "loss": 0.7728, "step": 2516 }, { "epoch": 0.6927681827564852, "grad_norm": 0.23149331987418773, "learning_rate": 8.776258692752355e-06, "loss": 0.7898, "step": 2517 }, { "epoch": 0.6930434184270281, "grad_norm": 0.227401694556156, "learning_rate": 8.761801010735906e-06, "loss": 0.7655, "step": 2518 }, { "epoch": 0.6933186540975711, "grad_norm": 0.21109232570009917, "learning_rate": 8.747351906147225e-06, "loss": 0.7716, "step": 2519 }, { "epoch": 0.693593889768114, "grad_norm": 0.2207805835482109, "learning_rate": 8.73291139001443e-06, "loss": 0.7424, "step": 2520 }, { "epoch": 0.6938691254386569, "grad_norm": 0.24099588906668826, "learning_rate": 8.718479473359067e-06, "loss": 0.7848, "step": 2521 }, { "epoch": 0.6941443611091997, "grad_norm": 0.21952719725201358, "learning_rate": 8.704056167196148e-06, "loss": 0.7934, "step": 2522 }, { "epoch": 0.6944195967797426, "grad_norm": 0.22370261913303857, "learning_rate": 8.689641482534083e-06, "loss": 0.7637, "step": 2523 }, { "epoch": 0.6946948324502855, "grad_norm": 0.2238425402607639, "learning_rate": 8.675235430374722e-06, "loss": 0.7738, "step": 2524 }, { "epoch": 0.6949700681208285, "grad_norm": 0.30398606659718463, "learning_rate": 8.660838021713323e-06, "loss": 0.807, "step": 2525 }, { "epoch": 0.6952453037913714, "grad_norm": 0.22191070111807776, "learning_rate": 8.646449267538544e-06, "loss": 0.7752, "step": 2526 }, { "epoch": 0.6955205394619143, "grad_norm": 0.22203322308020254, "learning_rate": 8.632069178832445e-06, "loss": 0.7415, "step": 2527 }, { "epoch": 0.6957957751324572, "grad_norm": 0.23590067454635755, "learning_rate": 8.617697766570449e-06, "loss": 0.7796, "step": 2528 }, { "epoch": 0.696071010803, "grad_norm": 0.22047356060959447, "learning_rate": 8.603335041721386e-06, "loss": 0.7672, "step": 2529 }, { "epoch": 0.6963462464735429, "grad_norm": 0.22598565446047506, "learning_rate": 8.588981015247443e-06, "loss": 0.7847, "step": 2530 }, { "epoch": 0.6966214821440859, "grad_norm": 0.22701575166779644, "learning_rate": 8.57463569810415e-06, "loss": 0.7649, "step": 2531 }, { "epoch": 0.6968967178146288, "grad_norm": 0.22044350576235772, "learning_rate": 8.560299101240436e-06, "loss": 0.7673, "step": 2532 }, { "epoch": 0.6971719534851717, "grad_norm": 0.215508752325192, "learning_rate": 8.545971235598524e-06, "loss": 0.7686, "step": 2533 }, { "epoch": 0.6974471891557146, "grad_norm": 0.22641735357232648, "learning_rate": 8.531652112114011e-06, "loss": 0.7628, "step": 2534 }, { "epoch": 0.6977224248262575, "grad_norm": 0.23307235794961992, "learning_rate": 8.517341741715787e-06, "loss": 0.7756, "step": 2535 }, { "epoch": 0.6979976604968003, "grad_norm": 0.21664296251972612, "learning_rate": 8.503040135326088e-06, "loss": 0.7779, "step": 2536 }, { "epoch": 0.6982728961673433, "grad_norm": 0.22873303552461818, "learning_rate": 8.488747303860463e-06, "loss": 0.7883, "step": 2537 }, { "epoch": 0.6985481318378862, "grad_norm": 0.2272053264983479, "learning_rate": 8.474463258227727e-06, "loss": 0.7853, "step": 2538 }, { "epoch": 0.6988233675084291, "grad_norm": 0.20952576876841586, "learning_rate": 8.460188009330049e-06, "loss": 0.7664, "step": 2539 }, { "epoch": 0.699098603178972, "grad_norm": 0.23100765351637462, "learning_rate": 8.445921568062826e-06, "loss": 0.774, "step": 2540 }, { "epoch": 0.6993738388495149, "grad_norm": 0.22587364684668013, "learning_rate": 8.431663945314766e-06, "loss": 0.7656, "step": 2541 }, { "epoch": 0.6996490745200578, "grad_norm": 0.22018981683186592, "learning_rate": 8.417415151967842e-06, "loss": 0.7827, "step": 2542 }, { "epoch": 0.6999243101906008, "grad_norm": 0.23146857047713387, "learning_rate": 8.403175198897276e-06, "loss": 0.7704, "step": 2543 }, { "epoch": 0.7001995458611436, "grad_norm": 0.22218270447001012, "learning_rate": 8.388944096971556e-06, "loss": 0.7794, "step": 2544 }, { "epoch": 0.7004747815316865, "grad_norm": 0.22157050442550313, "learning_rate": 8.374721857052395e-06, "loss": 0.8121, "step": 2545 }, { "epoch": 0.7007500172022294, "grad_norm": 0.22820270674719595, "learning_rate": 8.360508489994781e-06, "loss": 0.7765, "step": 2546 }, { "epoch": 0.7010252528727723, "grad_norm": 0.21783442266235062, "learning_rate": 8.346304006646884e-06, "loss": 0.7874, "step": 2547 }, { "epoch": 0.7013004885433152, "grad_norm": 0.21583686423778445, "learning_rate": 8.33210841785012e-06, "loss": 0.7603, "step": 2548 }, { "epoch": 0.7015757242138582, "grad_norm": 0.21547658169077147, "learning_rate": 8.317921734439122e-06, "loss": 0.7765, "step": 2549 }, { "epoch": 0.701850959884401, "grad_norm": 0.2189600409277528, "learning_rate": 8.3037439672417e-06, "loss": 0.7983, "step": 2550 }, { "epoch": 0.7021261955549439, "grad_norm": 0.23042073193250784, "learning_rate": 8.289575127078877e-06, "loss": 0.7741, "step": 2551 }, { "epoch": 0.7024014312254868, "grad_norm": 0.21489622516586931, "learning_rate": 8.275415224764871e-06, "loss": 0.8043, "step": 2552 }, { "epoch": 0.7026766668960297, "grad_norm": 0.22017222680919535, "learning_rate": 8.261264271107043e-06, "loss": 0.7568, "step": 2553 }, { "epoch": 0.7029519025665726, "grad_norm": 0.21867384731949382, "learning_rate": 8.247122276905976e-06, "loss": 0.7731, "step": 2554 }, { "epoch": 0.7032271382371156, "grad_norm": 0.22381432608871324, "learning_rate": 8.232989252955369e-06, "loss": 0.7767, "step": 2555 }, { "epoch": 0.7035023739076585, "grad_norm": 0.22301434456062752, "learning_rate": 8.2188652100421e-06, "loss": 0.7646, "step": 2556 }, { "epoch": 0.7037776095782013, "grad_norm": 0.22163531837428702, "learning_rate": 8.204750158946173e-06, "loss": 0.7736, "step": 2557 }, { "epoch": 0.7040528452487442, "grad_norm": 0.23481395694357782, "learning_rate": 8.190644110440748e-06, "loss": 0.7832, "step": 2558 }, { "epoch": 0.7043280809192871, "grad_norm": 0.20940601239892792, "learning_rate": 8.176547075292116e-06, "loss": 0.7766, "step": 2559 }, { "epoch": 0.70460331658983, "grad_norm": 0.2209708685015769, "learning_rate": 8.162459064259653e-06, "loss": 0.7971, "step": 2560 }, { "epoch": 0.704878552260373, "grad_norm": 0.21719861143626087, "learning_rate": 8.148380088095904e-06, "loss": 0.7778, "step": 2561 }, { "epoch": 0.7051537879309159, "grad_norm": 0.21693256303005057, "learning_rate": 8.134310157546466e-06, "loss": 0.755, "step": 2562 }, { "epoch": 0.7054290236014588, "grad_norm": 0.21658199171877832, "learning_rate": 8.120249283350061e-06, "loss": 0.7702, "step": 2563 }, { "epoch": 0.7057042592720016, "grad_norm": 0.22693379810642794, "learning_rate": 8.1061974762385e-06, "loss": 0.7756, "step": 2564 }, { "epoch": 0.7059794949425445, "grad_norm": 0.22557625670715278, "learning_rate": 8.09215474693665e-06, "loss": 0.7947, "step": 2565 }, { "epoch": 0.7062547306130874, "grad_norm": 0.3560822885307025, "learning_rate": 8.078121106162475e-06, "loss": 0.7981, "step": 2566 }, { "epoch": 0.7065299662836304, "grad_norm": 0.21982419278393076, "learning_rate": 8.064096564626977e-06, "loss": 0.7747, "step": 2567 }, { "epoch": 0.7068052019541733, "grad_norm": 0.216863233336889, "learning_rate": 8.050081133034247e-06, "loss": 0.789, "step": 2568 }, { "epoch": 0.7070804376247162, "grad_norm": 0.2129641511926811, "learning_rate": 8.036074822081401e-06, "loss": 0.7775, "step": 2569 }, { "epoch": 0.7073556732952591, "grad_norm": 0.2185402896419739, "learning_rate": 8.022077642458588e-06, "loss": 0.7856, "step": 2570 }, { "epoch": 0.7076309089658019, "grad_norm": 0.20996769819736522, "learning_rate": 8.008089604849008e-06, "loss": 0.7365, "step": 2571 }, { "epoch": 0.7079061446363448, "grad_norm": 0.21688214146662685, "learning_rate": 7.994110719928856e-06, "loss": 0.7757, "step": 2572 }, { "epoch": 0.7081813803068878, "grad_norm": 0.2185082099924713, "learning_rate": 7.980140998367365e-06, "loss": 0.7599, "step": 2573 }, { "epoch": 0.7084566159774307, "grad_norm": 0.2336201524887943, "learning_rate": 7.966180450826768e-06, "loss": 0.8186, "step": 2574 }, { "epoch": 0.7087318516479736, "grad_norm": 0.21981919163309177, "learning_rate": 7.952229087962296e-06, "loss": 0.7776, "step": 2575 }, { "epoch": 0.7090070873185165, "grad_norm": 0.21652149541995094, "learning_rate": 7.938286920422169e-06, "loss": 0.7644, "step": 2576 }, { "epoch": 0.7092823229890594, "grad_norm": 0.23425169143460922, "learning_rate": 7.92435395884758e-06, "loss": 0.7653, "step": 2577 }, { "epoch": 0.7095575586596022, "grad_norm": 0.21985760553119063, "learning_rate": 7.910430213872709e-06, "loss": 0.7609, "step": 2578 }, { "epoch": 0.7098327943301452, "grad_norm": 0.22588238810554612, "learning_rate": 7.896515696124703e-06, "loss": 0.7726, "step": 2579 }, { "epoch": 0.7101080300006881, "grad_norm": 0.23218861287312292, "learning_rate": 7.882610416223644e-06, "loss": 0.8013, "step": 2580 }, { "epoch": 0.710383265671231, "grad_norm": 0.22362351695436455, "learning_rate": 7.868714384782588e-06, "loss": 0.7775, "step": 2581 }, { "epoch": 0.7106585013417739, "grad_norm": 0.24388419406285858, "learning_rate": 7.854827612407521e-06, "loss": 0.797, "step": 2582 }, { "epoch": 0.7109337370123168, "grad_norm": 0.21752661884274282, "learning_rate": 7.840950109697373e-06, "loss": 0.7888, "step": 2583 }, { "epoch": 0.7112089726828597, "grad_norm": 0.23559152832695637, "learning_rate": 7.82708188724398e-06, "loss": 0.7741, "step": 2584 }, { "epoch": 0.7114842083534026, "grad_norm": 0.21694960124158888, "learning_rate": 7.813222955632107e-06, "loss": 0.7652, "step": 2585 }, { "epoch": 0.7117594440239455, "grad_norm": 0.21834541915733874, "learning_rate": 7.799373325439435e-06, "loss": 0.7905, "step": 2586 }, { "epoch": 0.7120346796944884, "grad_norm": 0.21797658290212968, "learning_rate": 7.785533007236521e-06, "loss": 0.7688, "step": 2587 }, { "epoch": 0.7123099153650313, "grad_norm": 0.21881153505452441, "learning_rate": 7.77170201158684e-06, "loss": 0.7949, "step": 2588 }, { "epoch": 0.7125851510355742, "grad_norm": 0.21258110515309403, "learning_rate": 7.757880349046742e-06, "loss": 0.7845, "step": 2589 }, { "epoch": 0.7128603867061171, "grad_norm": 0.25572637344952137, "learning_rate": 7.744068030165454e-06, "loss": 0.7618, "step": 2590 }, { "epoch": 0.7131356223766601, "grad_norm": 0.21293292230523622, "learning_rate": 7.730265065485082e-06, "loss": 0.8043, "step": 2591 }, { "epoch": 0.713410858047203, "grad_norm": 0.23308784980622776, "learning_rate": 7.71647146554056e-06, "loss": 0.7771, "step": 2592 }, { "epoch": 0.7136860937177458, "grad_norm": 0.23235681475884892, "learning_rate": 7.702687240859717e-06, "loss": 0.7834, "step": 2593 }, { "epoch": 0.7139613293882887, "grad_norm": 0.22205098937173648, "learning_rate": 7.68891240196319e-06, "loss": 0.758, "step": 2594 }, { "epoch": 0.7142365650588316, "grad_norm": 0.23388667670185762, "learning_rate": 7.675146959364473e-06, "loss": 0.7623, "step": 2595 }, { "epoch": 0.7145118007293745, "grad_norm": 0.21123479306711065, "learning_rate": 7.661390923569889e-06, "loss": 0.7607, "step": 2596 }, { "epoch": 0.7147870363999175, "grad_norm": 0.4441814421607099, "learning_rate": 7.647644305078572e-06, "loss": 0.7899, "step": 2597 }, { "epoch": 0.7150622720704604, "grad_norm": 0.22675347109781566, "learning_rate": 7.63390711438248e-06, "loss": 0.7615, "step": 2598 }, { "epoch": 0.7153375077410032, "grad_norm": 0.23992857226961903, "learning_rate": 7.620179361966356e-06, "loss": 0.7916, "step": 2599 }, { "epoch": 0.7156127434115461, "grad_norm": 0.24172365626282546, "learning_rate": 7.606461058307755e-06, "loss": 0.7608, "step": 2600 }, { "epoch": 0.715887979082089, "grad_norm": 0.24581114930095574, "learning_rate": 7.592752213877026e-06, "loss": 0.7643, "step": 2601 }, { "epoch": 0.7161632147526319, "grad_norm": 0.23243543587662152, "learning_rate": 7.579052839137273e-06, "loss": 0.7975, "step": 2602 }, { "epoch": 0.7164384504231749, "grad_norm": 0.22742501150177027, "learning_rate": 7.565362944544396e-06, "loss": 0.7565, "step": 2603 }, { "epoch": 0.7167136860937178, "grad_norm": 0.20860500190427597, "learning_rate": 7.551682540547054e-06, "loss": 0.7661, "step": 2604 }, { "epoch": 0.7169889217642607, "grad_norm": 0.22148520453669318, "learning_rate": 7.538011637586658e-06, "loss": 0.7691, "step": 2605 }, { "epoch": 0.7172641574348035, "grad_norm": 0.22797264889547875, "learning_rate": 7.524350246097374e-06, "loss": 0.7616, "step": 2606 }, { "epoch": 0.7175393931053464, "grad_norm": 0.2130472130988018, "learning_rate": 7.510698376506091e-06, "loss": 0.7753, "step": 2607 }, { "epoch": 0.7178146287758893, "grad_norm": 0.4091533442654354, "learning_rate": 7.497056039232462e-06, "loss": 0.7764, "step": 2608 }, { "epoch": 0.7180898644464323, "grad_norm": 0.23280487333957706, "learning_rate": 7.483423244688828e-06, "loss": 0.8078, "step": 2609 }, { "epoch": 0.7183651001169752, "grad_norm": 0.21388497928006925, "learning_rate": 7.46980000328027e-06, "loss": 0.765, "step": 2610 }, { "epoch": 0.7186403357875181, "grad_norm": 0.23504724160770063, "learning_rate": 7.456186325404575e-06, "loss": 0.7808, "step": 2611 }, { "epoch": 0.718915571458061, "grad_norm": 0.23494718344875026, "learning_rate": 7.44258222145223e-06, "loss": 0.7801, "step": 2612 }, { "epoch": 0.7191908071286038, "grad_norm": 0.22478128339705314, "learning_rate": 7.428987701806416e-06, "loss": 0.774, "step": 2613 }, { "epoch": 0.7194660427991467, "grad_norm": 0.22527016656773594, "learning_rate": 7.415402776842982e-06, "loss": 0.7782, "step": 2614 }, { "epoch": 0.7197412784696897, "grad_norm": 0.2248601002155795, "learning_rate": 7.401827456930477e-06, "loss": 0.7948, "step": 2615 }, { "epoch": 0.7200165141402326, "grad_norm": 0.257821893062983, "learning_rate": 7.388261752430115e-06, "loss": 0.7868, "step": 2616 }, { "epoch": 0.7202917498107755, "grad_norm": 0.2141776789864948, "learning_rate": 7.374705673695748e-06, "loss": 0.8008, "step": 2617 }, { "epoch": 0.7205669854813184, "grad_norm": 0.22378162305974467, "learning_rate": 7.361159231073922e-06, "loss": 0.7841, "step": 2618 }, { "epoch": 0.7208422211518613, "grad_norm": 0.21435602613738422, "learning_rate": 7.347622434903787e-06, "loss": 0.7785, "step": 2619 }, { "epoch": 0.7211174568224041, "grad_norm": 0.22718048855111325, "learning_rate": 7.3340952955171655e-06, "loss": 0.7843, "step": 2620 }, { "epoch": 0.7213926924929471, "grad_norm": 0.22841310724341327, "learning_rate": 7.320577823238475e-06, "loss": 0.7725, "step": 2621 }, { "epoch": 0.72166792816349, "grad_norm": 0.21325490490438734, "learning_rate": 7.307070028384782e-06, "loss": 0.7895, "step": 2622 }, { "epoch": 0.7219431638340329, "grad_norm": 0.2258875597667349, "learning_rate": 7.293571921265765e-06, "loss": 0.7666, "step": 2623 }, { "epoch": 0.7222183995045758, "grad_norm": 0.21190356615671044, "learning_rate": 7.280083512183678e-06, "loss": 0.7633, "step": 2624 }, { "epoch": 0.7224936351751187, "grad_norm": 0.2231753865614009, "learning_rate": 7.266604811433424e-06, "loss": 0.7469, "step": 2625 }, { "epoch": 0.7227688708456615, "grad_norm": 0.22143692791586356, "learning_rate": 7.253135829302451e-06, "loss": 0.7748, "step": 2626 }, { "epoch": 0.7230441065162045, "grad_norm": 0.21333224666052628, "learning_rate": 7.239676576070809e-06, "loss": 0.7818, "step": 2627 }, { "epoch": 0.7233193421867474, "grad_norm": 0.2187465656916614, "learning_rate": 7.2262270620111305e-06, "loss": 0.7926, "step": 2628 }, { "epoch": 0.7235945778572903, "grad_norm": 0.21542351593374082, "learning_rate": 7.212787297388588e-06, "loss": 0.8123, "step": 2629 }, { "epoch": 0.7238698135278332, "grad_norm": 0.2182686100645093, "learning_rate": 7.199357292460945e-06, "loss": 0.7958, "step": 2630 }, { "epoch": 0.7241450491983761, "grad_norm": 0.22336430210451583, "learning_rate": 7.185937057478478e-06, "loss": 0.7758, "step": 2631 }, { "epoch": 0.724420284868919, "grad_norm": 0.21283687484459596, "learning_rate": 7.172526602684058e-06, "loss": 0.7828, "step": 2632 }, { "epoch": 0.724695520539462, "grad_norm": 0.21296296324730565, "learning_rate": 7.159125938313041e-06, "loss": 0.78, "step": 2633 }, { "epoch": 0.7249707562100048, "grad_norm": 0.2235125890136319, "learning_rate": 7.145735074593338e-06, "loss": 0.8013, "step": 2634 }, { "epoch": 0.7252459918805477, "grad_norm": 0.22569966450039847, "learning_rate": 7.132354021745383e-06, "loss": 0.8054, "step": 2635 }, { "epoch": 0.7255212275510906, "grad_norm": 0.22299176954010355, "learning_rate": 7.118982789982096e-06, "loss": 0.7813, "step": 2636 }, { "epoch": 0.7257964632216335, "grad_norm": 0.21174516512179112, "learning_rate": 7.105621389508925e-06, "loss": 0.7489, "step": 2637 }, { "epoch": 0.7260716988921764, "grad_norm": 0.2254521324842919, "learning_rate": 7.09226983052381e-06, "loss": 0.7875, "step": 2638 }, { "epoch": 0.7263469345627194, "grad_norm": 0.22056007534564895, "learning_rate": 7.078928123217175e-06, "loss": 0.7938, "step": 2639 }, { "epoch": 0.7266221702332623, "grad_norm": 0.2199929946297742, "learning_rate": 7.065596277771931e-06, "loss": 0.7815, "step": 2640 }, { "epoch": 0.7268974059038051, "grad_norm": 0.20472138887987787, "learning_rate": 7.052274304363449e-06, "loss": 0.7776, "step": 2641 }, { "epoch": 0.727172641574348, "grad_norm": 0.21545903753551834, "learning_rate": 7.0389622131595835e-06, "loss": 0.7738, "step": 2642 }, { "epoch": 0.7274478772448909, "grad_norm": 0.21142960653804516, "learning_rate": 7.0256600143206235e-06, "loss": 0.7856, "step": 2643 }, { "epoch": 0.7277231129154338, "grad_norm": 0.21948079817216246, "learning_rate": 7.012367717999331e-06, "loss": 0.7899, "step": 2644 }, { "epoch": 0.7279983485859768, "grad_norm": 0.2043438503379916, "learning_rate": 6.9990853343408986e-06, "loss": 0.7756, "step": 2645 }, { "epoch": 0.7282735842565197, "grad_norm": 0.20985830968379818, "learning_rate": 6.985812873482953e-06, "loss": 0.7988, "step": 2646 }, { "epoch": 0.7285488199270626, "grad_norm": 0.2243795238123144, "learning_rate": 6.97255034555556e-06, "loss": 0.7971, "step": 2647 }, { "epoch": 0.7288240555976054, "grad_norm": 0.2046682781819276, "learning_rate": 6.959297760681176e-06, "loss": 0.7856, "step": 2648 }, { "epoch": 0.7290992912681483, "grad_norm": 0.21705682375699856, "learning_rate": 6.946055128974694e-06, "loss": 0.7979, "step": 2649 }, { "epoch": 0.7293745269386912, "grad_norm": 0.23901909549553974, "learning_rate": 6.932822460543409e-06, "loss": 0.7705, "step": 2650 }, { "epoch": 0.7296497626092342, "grad_norm": 0.5511118712416953, "learning_rate": 6.919599765486993e-06, "loss": 0.7994, "step": 2651 }, { "epoch": 0.7299249982797771, "grad_norm": 0.20488173065189808, "learning_rate": 6.906387053897523e-06, "loss": 0.7696, "step": 2652 }, { "epoch": 0.73020023395032, "grad_norm": 0.22057455829477815, "learning_rate": 6.89318433585945e-06, "loss": 0.7959, "step": 2653 }, { "epoch": 0.7304754696208628, "grad_norm": 0.2055333890282667, "learning_rate": 6.879991621449602e-06, "loss": 0.7684, "step": 2654 }, { "epoch": 0.7307507052914057, "grad_norm": 0.2111660825636503, "learning_rate": 6.866808920737174e-06, "loss": 0.73, "step": 2655 }, { "epoch": 0.7310259409619486, "grad_norm": 0.2188215699005884, "learning_rate": 6.853636243783697e-06, "loss": 0.7733, "step": 2656 }, { "epoch": 0.7313011766324916, "grad_norm": 0.2145065275849248, "learning_rate": 6.840473600643081e-06, "loss": 0.8002, "step": 2657 }, { "epoch": 0.7315764123030345, "grad_norm": 0.23142482708949125, "learning_rate": 6.8273210013615536e-06, "loss": 0.7817, "step": 2658 }, { "epoch": 0.7318516479735774, "grad_norm": 0.20594948595110116, "learning_rate": 6.814178455977689e-06, "loss": 0.8007, "step": 2659 }, { "epoch": 0.7321268836441203, "grad_norm": 0.21349424936460418, "learning_rate": 6.801045974522389e-06, "loss": 0.7615, "step": 2660 }, { "epoch": 0.7324021193146631, "grad_norm": 0.21866381173181135, "learning_rate": 6.7879235670188705e-06, "loss": 0.7709, "step": 2661 }, { "epoch": 0.732677354985206, "grad_norm": 0.21029288711314637, "learning_rate": 6.774811243482667e-06, "loss": 0.7628, "step": 2662 }, { "epoch": 0.732952590655749, "grad_norm": 0.2209747070892804, "learning_rate": 6.7617090139216e-06, "loss": 0.7752, "step": 2663 }, { "epoch": 0.7332278263262919, "grad_norm": 0.23083516086026892, "learning_rate": 6.7486168883358015e-06, "loss": 0.7897, "step": 2664 }, { "epoch": 0.7335030619968348, "grad_norm": 0.2112523377093742, "learning_rate": 6.735534876717695e-06, "loss": 0.7815, "step": 2665 }, { "epoch": 0.7337782976673777, "grad_norm": 0.20947016260086723, "learning_rate": 6.722462989051965e-06, "loss": 0.788, "step": 2666 }, { "epoch": 0.7340535333379206, "grad_norm": 0.22127867347340469, "learning_rate": 6.709401235315587e-06, "loss": 0.7916, "step": 2667 }, { "epoch": 0.7343287690084634, "grad_norm": 0.2113364980957063, "learning_rate": 6.696349625477798e-06, "loss": 0.7914, "step": 2668 }, { "epoch": 0.7346040046790064, "grad_norm": 0.21443903627483418, "learning_rate": 6.683308169500094e-06, "loss": 0.7866, "step": 2669 }, { "epoch": 0.7348792403495493, "grad_norm": 0.22351041775050992, "learning_rate": 6.670276877336208e-06, "loss": 0.7639, "step": 2670 }, { "epoch": 0.7351544760200922, "grad_norm": 0.21343161756436144, "learning_rate": 6.657255758932133e-06, "loss": 0.7593, "step": 2671 }, { "epoch": 0.7354297116906351, "grad_norm": 0.206875012944444, "learning_rate": 6.644244824226094e-06, "loss": 0.7784, "step": 2672 }, { "epoch": 0.735704947361178, "grad_norm": 0.2163725621134461, "learning_rate": 6.631244083148525e-06, "loss": 0.7744, "step": 2673 }, { "epoch": 0.7359801830317209, "grad_norm": 0.22142145880785594, "learning_rate": 6.618253545622104e-06, "loss": 0.7521, "step": 2674 }, { "epoch": 0.7362554187022639, "grad_norm": 0.20227419570146793, "learning_rate": 6.60527322156171e-06, "loss": 0.7424, "step": 2675 }, { "epoch": 0.7365306543728067, "grad_norm": 0.21530029313418675, "learning_rate": 6.592303120874428e-06, "loss": 0.7774, "step": 2676 }, { "epoch": 0.7368058900433496, "grad_norm": 0.20766952535418937, "learning_rate": 6.579343253459545e-06, "loss": 0.7824, "step": 2677 }, { "epoch": 0.7370811257138925, "grad_norm": 0.209642890188279, "learning_rate": 6.566393629208523e-06, "loss": 0.7753, "step": 2678 }, { "epoch": 0.7373563613844354, "grad_norm": 0.2140088731249423, "learning_rate": 6.553454258005025e-06, "loss": 0.7922, "step": 2679 }, { "epoch": 0.7376315970549783, "grad_norm": 0.20476957127418594, "learning_rate": 6.540525149724868e-06, "loss": 0.7764, "step": 2680 }, { "epoch": 0.7379068327255213, "grad_norm": 0.2180808817119653, "learning_rate": 6.527606314236053e-06, "loss": 0.8113, "step": 2681 }, { "epoch": 0.7381820683960642, "grad_norm": 0.1998712377413449, "learning_rate": 6.514697761398734e-06, "loss": 0.7628, "step": 2682 }, { "epoch": 0.738457304066607, "grad_norm": 0.22040649973499035, "learning_rate": 6.501799501065218e-06, "loss": 0.7783, "step": 2683 }, { "epoch": 0.7387325397371499, "grad_norm": 0.2106578922619492, "learning_rate": 6.488911543079963e-06, "loss": 0.7874, "step": 2684 }, { "epoch": 0.7390077754076928, "grad_norm": 0.20820361702320744, "learning_rate": 6.476033897279544e-06, "loss": 0.763, "step": 2685 }, { "epoch": 0.7392830110782357, "grad_norm": 0.20952645664031386, "learning_rate": 6.463166573492683e-06, "loss": 0.7884, "step": 2686 }, { "epoch": 0.7395582467487787, "grad_norm": 0.21486499686804741, "learning_rate": 6.450309581540224e-06, "loss": 0.7806, "step": 2687 }, { "epoch": 0.7398334824193216, "grad_norm": 0.21459816829548498, "learning_rate": 6.437462931235103e-06, "loss": 0.7614, "step": 2688 }, { "epoch": 0.7401087180898644, "grad_norm": 0.21430064000588245, "learning_rate": 6.424626632382407e-06, "loss": 0.7608, "step": 2689 }, { "epoch": 0.7403839537604073, "grad_norm": 0.21700886976937256, "learning_rate": 6.411800694779271e-06, "loss": 0.791, "step": 2690 }, { "epoch": 0.7406591894309502, "grad_norm": 0.22130148583431022, "learning_rate": 6.398985128214959e-06, "loss": 0.7775, "step": 2691 }, { "epoch": 0.7409344251014931, "grad_norm": 0.20982250779474793, "learning_rate": 6.386179942470807e-06, "loss": 0.7706, "step": 2692 }, { "epoch": 0.7412096607720361, "grad_norm": 0.20401306529238422, "learning_rate": 6.373385147320219e-06, "loss": 0.7541, "step": 2693 }, { "epoch": 0.741484896442579, "grad_norm": 0.2195471330562807, "learning_rate": 6.360600752528689e-06, "loss": 0.7777, "step": 2694 }, { "epoch": 0.7417601321131219, "grad_norm": 0.2052895874422415, "learning_rate": 6.3478267678537396e-06, "loss": 0.7725, "step": 2695 }, { "epoch": 0.7420353677836647, "grad_norm": 0.20624667047981138, "learning_rate": 6.335063203045e-06, "loss": 0.7827, "step": 2696 }, { "epoch": 0.7423106034542076, "grad_norm": 0.20785169992857394, "learning_rate": 6.322310067844091e-06, "loss": 0.7903, "step": 2697 }, { "epoch": 0.7425858391247505, "grad_norm": 0.21614247749932902, "learning_rate": 6.3095673719847106e-06, "loss": 0.7879, "step": 2698 }, { "epoch": 0.7428610747952935, "grad_norm": 0.21383956902640192, "learning_rate": 6.296835125192578e-06, "loss": 0.7555, "step": 2699 }, { "epoch": 0.7431363104658364, "grad_norm": 0.20877126594816658, "learning_rate": 6.284113337185425e-06, "loss": 0.7712, "step": 2700 }, { "epoch": 0.7434115461363793, "grad_norm": 0.2115766812806403, "learning_rate": 6.271402017673021e-06, "loss": 0.7786, "step": 2701 }, { "epoch": 0.7436867818069222, "grad_norm": 0.20546818203490577, "learning_rate": 6.258701176357132e-06, "loss": 0.8017, "step": 2702 }, { "epoch": 0.743962017477465, "grad_norm": 0.21199643879353702, "learning_rate": 6.246010822931532e-06, "loss": 0.7674, "step": 2703 }, { "epoch": 0.7442372531480079, "grad_norm": 0.21475894347401708, "learning_rate": 6.2333309670819965e-06, "loss": 0.7586, "step": 2704 }, { "epoch": 0.7445124888185509, "grad_norm": 0.21509543984093626, "learning_rate": 6.220661618486268e-06, "loss": 0.7701, "step": 2705 }, { "epoch": 0.7447877244890938, "grad_norm": 0.20893566086832982, "learning_rate": 6.208002786814098e-06, "loss": 0.7659, "step": 2706 }, { "epoch": 0.7450629601596367, "grad_norm": 0.2001635192012533, "learning_rate": 6.195354481727181e-06, "loss": 0.7678, "step": 2707 }, { "epoch": 0.7453381958301796, "grad_norm": 0.20883481520896027, "learning_rate": 6.182716712879198e-06, "loss": 0.761, "step": 2708 }, { "epoch": 0.7456134315007225, "grad_norm": 0.2084136942059921, "learning_rate": 6.170089489915792e-06, "loss": 0.7845, "step": 2709 }, { "epoch": 0.7458886671712655, "grad_norm": 0.20534337376394513, "learning_rate": 6.157472822474524e-06, "loss": 0.7601, "step": 2710 }, { "epoch": 0.7461639028418083, "grad_norm": 0.20694835285974103, "learning_rate": 6.144866720184952e-06, "loss": 0.7758, "step": 2711 }, { "epoch": 0.7464391385123512, "grad_norm": 0.2129128193705512, "learning_rate": 6.132271192668518e-06, "loss": 0.7822, "step": 2712 }, { "epoch": 0.7467143741828941, "grad_norm": 0.20227471703214245, "learning_rate": 6.119686249538624e-06, "loss": 0.8066, "step": 2713 }, { "epoch": 0.746989609853437, "grad_norm": 0.209815586879814, "learning_rate": 6.107111900400589e-06, "loss": 0.7641, "step": 2714 }, { "epoch": 0.7472648455239799, "grad_norm": 0.21229486837207356, "learning_rate": 6.094548154851631e-06, "loss": 0.7967, "step": 2715 }, { "epoch": 0.7475400811945229, "grad_norm": 0.20809716226906377, "learning_rate": 6.0819950224809024e-06, "loss": 0.7831, "step": 2716 }, { "epoch": 0.7478153168650657, "grad_norm": 0.21016620242804573, "learning_rate": 6.069452512869411e-06, "loss": 0.7676, "step": 2717 }, { "epoch": 0.7480905525356086, "grad_norm": 0.20990730360216506, "learning_rate": 6.05692063559012e-06, "loss": 0.7694, "step": 2718 }, { "epoch": 0.7483657882061515, "grad_norm": 0.19635188064107884, "learning_rate": 6.044399400207817e-06, "loss": 0.7628, "step": 2719 }, { "epoch": 0.7486410238766944, "grad_norm": 0.21057427708628593, "learning_rate": 6.031888816279199e-06, "loss": 0.7869, "step": 2720 }, { "epoch": 0.7489162595472373, "grad_norm": 0.20003959979288685, "learning_rate": 6.019388893352838e-06, "loss": 0.7362, "step": 2721 }, { "epoch": 0.7491914952177803, "grad_norm": 0.20147588359078802, "learning_rate": 6.006899640969142e-06, "loss": 0.7621, "step": 2722 }, { "epoch": 0.7494667308883232, "grad_norm": 0.21665705345996358, "learning_rate": 5.994421068660396e-06, "loss": 0.7796, "step": 2723 }, { "epoch": 0.749741966558866, "grad_norm": 0.212541364525579, "learning_rate": 5.981953185950735e-06, "loss": 0.7539, "step": 2724 }, { "epoch": 0.7500172022294089, "grad_norm": 0.2031858588635556, "learning_rate": 5.969496002356121e-06, "loss": 0.7842, "step": 2725 }, { "epoch": 0.7502924378999518, "grad_norm": 0.20295736464712008, "learning_rate": 5.9570495273843705e-06, "loss": 0.7579, "step": 2726 }, { "epoch": 0.7505676735704947, "grad_norm": 0.21978493607175753, "learning_rate": 5.944613770535099e-06, "loss": 0.7839, "step": 2727 }, { "epoch": 0.7508429092410377, "grad_norm": 0.196979851348305, "learning_rate": 5.9321887412997695e-06, "loss": 0.7824, "step": 2728 }, { "epoch": 0.7511181449115806, "grad_norm": 0.20890745311280653, "learning_rate": 5.91977444916163e-06, "loss": 0.7364, "step": 2729 }, { "epoch": 0.7513933805821235, "grad_norm": 0.2172243799655082, "learning_rate": 5.907370903595757e-06, "loss": 0.7797, "step": 2730 }, { "epoch": 0.7516686162526663, "grad_norm": 0.20076579224891278, "learning_rate": 5.8949781140690166e-06, "loss": 0.7674, "step": 2731 }, { "epoch": 0.7519438519232092, "grad_norm": 0.20004532117901183, "learning_rate": 5.882596090040061e-06, "loss": 0.7473, "step": 2732 }, { "epoch": 0.7522190875937521, "grad_norm": 0.21370352536628204, "learning_rate": 5.87022484095934e-06, "loss": 0.7812, "step": 2733 }, { "epoch": 0.7524943232642951, "grad_norm": 0.20146215525752867, "learning_rate": 5.857864376269051e-06, "loss": 0.7721, "step": 2734 }, { "epoch": 0.752769558934838, "grad_norm": 0.20877758112932118, "learning_rate": 5.84551470540319e-06, "loss": 0.8085, "step": 2735 }, { "epoch": 0.7530447946053809, "grad_norm": 0.21114667619502187, "learning_rate": 5.833175837787506e-06, "loss": 0.7746, "step": 2736 }, { "epoch": 0.7533200302759238, "grad_norm": 0.21414604914230712, "learning_rate": 5.820847782839489e-06, "loss": 0.7854, "step": 2737 }, { "epoch": 0.7535952659464666, "grad_norm": 0.1981076499949966, "learning_rate": 5.808530549968392e-06, "loss": 0.7545, "step": 2738 }, { "epoch": 0.7538705016170095, "grad_norm": 0.21137767766334561, "learning_rate": 5.796224148575203e-06, "loss": 0.7645, "step": 2739 }, { "epoch": 0.7541457372875525, "grad_norm": 0.21277508850489377, "learning_rate": 5.783928588052643e-06, "loss": 0.7659, "step": 2740 }, { "epoch": 0.7544209729580954, "grad_norm": 0.21471107041164525, "learning_rate": 5.771643877785167e-06, "loss": 0.7639, "step": 2741 }, { "epoch": 0.7546962086286383, "grad_norm": 0.21141065538197631, "learning_rate": 5.759370027148925e-06, "loss": 0.7552, "step": 2742 }, { "epoch": 0.7549714442991812, "grad_norm": 0.20970651925923653, "learning_rate": 5.747107045511811e-06, "loss": 0.7623, "step": 2743 }, { "epoch": 0.755246679969724, "grad_norm": 0.216780701781582, "learning_rate": 5.73485494223339e-06, "loss": 0.7896, "step": 2744 }, { "epoch": 0.7555219156402669, "grad_norm": 0.21359290287749802, "learning_rate": 5.72261372666495e-06, "loss": 0.7625, "step": 2745 }, { "epoch": 0.7557971513108099, "grad_norm": 0.19829604785715904, "learning_rate": 5.710383408149456e-06, "loss": 0.7759, "step": 2746 }, { "epoch": 0.7560723869813528, "grad_norm": 0.21472421287741664, "learning_rate": 5.698163996021564e-06, "loss": 0.8087, "step": 2747 }, { "epoch": 0.7563476226518957, "grad_norm": 0.20762047223825586, "learning_rate": 5.685955499607605e-06, "loss": 0.7726, "step": 2748 }, { "epoch": 0.7566228583224386, "grad_norm": 0.19779265094083542, "learning_rate": 5.673757928225563e-06, "loss": 0.7658, "step": 2749 }, { "epoch": 0.7568980939929815, "grad_norm": 0.20656416297964883, "learning_rate": 5.6615712911851016e-06, "loss": 0.7932, "step": 2750 }, { "epoch": 0.7571733296635244, "grad_norm": 0.31567168953732694, "learning_rate": 5.649395597787544e-06, "loss": 0.7724, "step": 2751 }, { "epoch": 0.7574485653340673, "grad_norm": 0.2018029260095364, "learning_rate": 5.6372308573258235e-06, "loss": 0.772, "step": 2752 }, { "epoch": 0.7577238010046102, "grad_norm": 0.20563668646675162, "learning_rate": 5.625077079084571e-06, "loss": 0.7657, "step": 2753 }, { "epoch": 0.7579990366751531, "grad_norm": 0.21391863965197877, "learning_rate": 5.612934272340001e-06, "loss": 0.7785, "step": 2754 }, { "epoch": 0.758274272345696, "grad_norm": 0.2073258224296366, "learning_rate": 5.600802446359981e-06, "loss": 0.7583, "step": 2755 }, { "epoch": 0.7585495080162389, "grad_norm": 0.2097306152223069, "learning_rate": 5.588681610403978e-06, "loss": 0.7875, "step": 2756 }, { "epoch": 0.7588247436867818, "grad_norm": 0.20085077935674445, "learning_rate": 5.576571773723094e-06, "loss": 0.7572, "step": 2757 }, { "epoch": 0.7590999793573248, "grad_norm": 0.21460856643656978, "learning_rate": 5.5644729455600246e-06, "loss": 0.7873, "step": 2758 }, { "epoch": 0.7593752150278676, "grad_norm": 0.21354432137006993, "learning_rate": 5.552385135149048e-06, "loss": 0.769, "step": 2759 }, { "epoch": 0.7596504506984105, "grad_norm": 0.20477988532515246, "learning_rate": 5.5403083517160686e-06, "loss": 0.7844, "step": 2760 }, { "epoch": 0.7599256863689534, "grad_norm": 0.20760878856343412, "learning_rate": 5.5282426044785396e-06, "loss": 0.765, "step": 2761 }, { "epoch": 0.7602009220394963, "grad_norm": 0.21180288595410768, "learning_rate": 5.516187902645511e-06, "loss": 0.7427, "step": 2762 }, { "epoch": 0.7604761577100392, "grad_norm": 0.21179482853742132, "learning_rate": 5.504144255417605e-06, "loss": 0.7859, "step": 2763 }, { "epoch": 0.7607513933805822, "grad_norm": 0.20430100175741778, "learning_rate": 5.492111671986981e-06, "loss": 0.7817, "step": 2764 }, { "epoch": 0.7610266290511251, "grad_norm": 0.20493524906648022, "learning_rate": 5.480090161537388e-06, "loss": 0.7757, "step": 2765 }, { "epoch": 0.7613018647216679, "grad_norm": 0.2062615698229132, "learning_rate": 5.468079733244096e-06, "loss": 0.7554, "step": 2766 }, { "epoch": 0.7615771003922108, "grad_norm": 0.20852437361425066, "learning_rate": 5.45608039627393e-06, "loss": 0.8011, "step": 2767 }, { "epoch": 0.7618523360627537, "grad_norm": 0.19612811778492137, "learning_rate": 5.444092159785252e-06, "loss": 0.8036, "step": 2768 }, { "epoch": 0.7621275717332966, "grad_norm": 0.20814841495296343, "learning_rate": 5.4321150329279444e-06, "loss": 0.7653, "step": 2769 }, { "epoch": 0.7624028074038396, "grad_norm": 0.2067981257009054, "learning_rate": 5.420149024843422e-06, "loss": 0.7601, "step": 2770 }, { "epoch": 0.7626780430743825, "grad_norm": 0.19340169480293778, "learning_rate": 5.408194144664589e-06, "loss": 0.7786, "step": 2771 }, { "epoch": 0.7629532787449254, "grad_norm": 0.20213918063352884, "learning_rate": 5.396250401515879e-06, "loss": 0.7573, "step": 2772 }, { "epoch": 0.7632285144154682, "grad_norm": 0.36101439044528943, "learning_rate": 5.384317804513226e-06, "loss": 0.7686, "step": 2773 }, { "epoch": 0.7635037500860111, "grad_norm": 0.20121244805137242, "learning_rate": 5.372396362764032e-06, "loss": 0.7482, "step": 2774 }, { "epoch": 0.763778985756554, "grad_norm": 0.21229294525049264, "learning_rate": 5.360486085367223e-06, "loss": 0.7727, "step": 2775 }, { "epoch": 0.764054221427097, "grad_norm": 0.21460844358425726, "learning_rate": 5.348586981413167e-06, "loss": 0.7431, "step": 2776 }, { "epoch": 0.7643294570976399, "grad_norm": 0.20677375955687788, "learning_rate": 5.33669905998373e-06, "loss": 0.766, "step": 2777 }, { "epoch": 0.7646046927681828, "grad_norm": 0.20298605650792526, "learning_rate": 5.324822330152224e-06, "loss": 0.7729, "step": 2778 }, { "epoch": 0.7648799284387257, "grad_norm": 0.21364603158298678, "learning_rate": 5.312956800983431e-06, "loss": 0.7824, "step": 2779 }, { "epoch": 0.7651551641092685, "grad_norm": 0.1980306269101776, "learning_rate": 5.301102481533588e-06, "loss": 0.7663, "step": 2780 }, { "epoch": 0.7654303997798114, "grad_norm": 0.21113212712739332, "learning_rate": 5.289259380850356e-06, "loss": 0.7536, "step": 2781 }, { "epoch": 0.7657056354503544, "grad_norm": 0.22120168726052686, "learning_rate": 5.277427507972865e-06, "loss": 0.8017, "step": 2782 }, { "epoch": 0.7659808711208973, "grad_norm": 0.1991296610849373, "learning_rate": 5.265606871931646e-06, "loss": 0.7809, "step": 2783 }, { "epoch": 0.7662561067914402, "grad_norm": 0.205851615356833, "learning_rate": 5.253797481748664e-06, "loss": 0.728, "step": 2784 }, { "epoch": 0.7665313424619831, "grad_norm": 0.19935705085680255, "learning_rate": 5.241999346437312e-06, "loss": 0.7752, "step": 2785 }, { "epoch": 0.766806578132526, "grad_norm": 0.20025888764342273, "learning_rate": 5.230212475002372e-06, "loss": 0.7748, "step": 2786 }, { "epoch": 0.7670818138030688, "grad_norm": 0.2035110499205305, "learning_rate": 5.218436876440043e-06, "loss": 0.7666, "step": 2787 }, { "epoch": 0.7673570494736118, "grad_norm": 0.19652921691991823, "learning_rate": 5.206672559737918e-06, "loss": 0.7605, "step": 2788 }, { "epoch": 0.7676322851441547, "grad_norm": 0.2000057952894092, "learning_rate": 5.194919533874978e-06, "loss": 0.7761, "step": 2789 }, { "epoch": 0.7679075208146976, "grad_norm": 0.20458672467871827, "learning_rate": 5.1831778078215934e-06, "loss": 0.7969, "step": 2790 }, { "epoch": 0.7681827564852405, "grad_norm": 0.19592344239650988, "learning_rate": 5.17144739053949e-06, "loss": 0.7656, "step": 2791 }, { "epoch": 0.7684579921557834, "grad_norm": 0.20069305957708425, "learning_rate": 5.159728290981789e-06, "loss": 0.7448, "step": 2792 }, { "epoch": 0.7687332278263262, "grad_norm": 0.19691865493062355, "learning_rate": 5.148020518092946e-06, "loss": 0.7464, "step": 2793 }, { "epoch": 0.7690084634968692, "grad_norm": 0.20856646507200627, "learning_rate": 5.136324080808794e-06, "loss": 0.7527, "step": 2794 }, { "epoch": 0.7692836991674121, "grad_norm": 0.20139881996909267, "learning_rate": 5.124638988056505e-06, "loss": 0.7661, "step": 2795 }, { "epoch": 0.769558934837955, "grad_norm": 0.19980265147715442, "learning_rate": 5.112965248754593e-06, "loss": 0.7623, "step": 2796 }, { "epoch": 0.7698341705084979, "grad_norm": 0.2033409444923468, "learning_rate": 5.1013028718129125e-06, "loss": 0.7898, "step": 2797 }, { "epoch": 0.7701094061790408, "grad_norm": 0.2115439104758614, "learning_rate": 5.08965186613263e-06, "loss": 0.7751, "step": 2798 }, { "epoch": 0.7703846418495837, "grad_norm": 0.2014661531570698, "learning_rate": 5.078012240606247e-06, "loss": 0.7648, "step": 2799 }, { "epoch": 0.7706598775201267, "grad_norm": 0.201704932722407, "learning_rate": 5.066384004117584e-06, "loss": 0.7782, "step": 2800 }, { "epoch": 0.7709351131906695, "grad_norm": 0.2664404059344981, "learning_rate": 5.0547671655417475e-06, "loss": 0.7784, "step": 2801 }, { "epoch": 0.7712103488612124, "grad_norm": 0.20034439907179086, "learning_rate": 5.043161733745163e-06, "loss": 0.7673, "step": 2802 }, { "epoch": 0.7714855845317553, "grad_norm": 0.20392548465807855, "learning_rate": 5.031567717585544e-06, "loss": 0.7664, "step": 2803 }, { "epoch": 0.7717608202022982, "grad_norm": 0.20245352307159437, "learning_rate": 5.019985125911899e-06, "loss": 0.7615, "step": 2804 }, { "epoch": 0.7720360558728411, "grad_norm": 0.20197079632256648, "learning_rate": 5.008413967564496e-06, "loss": 0.7762, "step": 2805 }, { "epoch": 0.7723112915433841, "grad_norm": 0.19548801964183182, "learning_rate": 4.996854251374901e-06, "loss": 0.7698, "step": 2806 }, { "epoch": 0.772586527213927, "grad_norm": 0.20770427627275576, "learning_rate": 4.985305986165934e-06, "loss": 0.7576, "step": 2807 }, { "epoch": 0.7728617628844698, "grad_norm": 0.20117380867737134, "learning_rate": 4.973769180751673e-06, "loss": 0.7814, "step": 2808 }, { "epoch": 0.7731369985550127, "grad_norm": 0.20374138518222956, "learning_rate": 4.962243843937455e-06, "loss": 0.7478, "step": 2809 }, { "epoch": 0.7734122342255556, "grad_norm": 0.21250544302689162, "learning_rate": 4.950729984519864e-06, "loss": 0.7753, "step": 2810 }, { "epoch": 0.7736874698960985, "grad_norm": 0.2083913435747433, "learning_rate": 4.939227611286724e-06, "loss": 0.7653, "step": 2811 }, { "epoch": 0.7739627055666415, "grad_norm": 0.23397763953285183, "learning_rate": 4.927736733017092e-06, "loss": 0.7671, "step": 2812 }, { "epoch": 0.7742379412371844, "grad_norm": 0.2072476530387938, "learning_rate": 4.916257358481245e-06, "loss": 0.7971, "step": 2813 }, { "epoch": 0.7745131769077273, "grad_norm": 0.2090302491655131, "learning_rate": 4.904789496440692e-06, "loss": 0.758, "step": 2814 }, { "epoch": 0.7747884125782701, "grad_norm": 0.2198219751580214, "learning_rate": 4.893333155648136e-06, "loss": 0.7874, "step": 2815 }, { "epoch": 0.775063648248813, "grad_norm": 0.20189635971867195, "learning_rate": 4.881888344847512e-06, "loss": 0.7698, "step": 2816 }, { "epoch": 0.7753388839193559, "grad_norm": 0.20717508735363308, "learning_rate": 4.870455072773934e-06, "loss": 0.7793, "step": 2817 }, { "epoch": 0.7756141195898989, "grad_norm": 0.21574722718069833, "learning_rate": 4.859033348153721e-06, "loss": 0.8037, "step": 2818 }, { "epoch": 0.7758893552604418, "grad_norm": 0.20852967761604516, "learning_rate": 4.847623179704379e-06, "loss": 0.7787, "step": 2819 }, { "epoch": 0.7761645909309847, "grad_norm": 0.20298686271435418, "learning_rate": 4.836224576134581e-06, "loss": 0.7673, "step": 2820 }, { "epoch": 0.7764398266015275, "grad_norm": 0.20542244356161593, "learning_rate": 4.824837546144183e-06, "loss": 0.7814, "step": 2821 }, { "epoch": 0.7767150622720704, "grad_norm": 0.20741189143890504, "learning_rate": 4.813462098424213e-06, "loss": 0.7466, "step": 2822 }, { "epoch": 0.7769902979426133, "grad_norm": 0.2196502589426676, "learning_rate": 4.802098241656845e-06, "loss": 0.7874, "step": 2823 }, { "epoch": 0.7772655336131563, "grad_norm": 0.20375783521133, "learning_rate": 4.790745984515415e-06, "loss": 0.7645, "step": 2824 }, { "epoch": 0.7775407692836992, "grad_norm": 0.19447425806545415, "learning_rate": 4.779405335664404e-06, "loss": 0.7414, "step": 2825 }, { "epoch": 0.7778160049542421, "grad_norm": 0.20846971348585894, "learning_rate": 4.7680763037594364e-06, "loss": 0.7748, "step": 2826 }, { "epoch": 0.778091240624785, "grad_norm": 0.20256123680867555, "learning_rate": 4.7567588974472734e-06, "loss": 0.7961, "step": 2827 }, { "epoch": 0.7783664762953278, "grad_norm": 0.19768646062168152, "learning_rate": 4.745453125365782e-06, "loss": 0.774, "step": 2828 }, { "epoch": 0.7786417119658707, "grad_norm": 0.20804450589556203, "learning_rate": 4.734158996143978e-06, "loss": 0.7688, "step": 2829 }, { "epoch": 0.7789169476364137, "grad_norm": 0.20764714967614814, "learning_rate": 4.7228765184019644e-06, "loss": 0.7705, "step": 2830 }, { "epoch": 0.7791921833069566, "grad_norm": 0.2730353583813722, "learning_rate": 4.711605700750972e-06, "loss": 0.7574, "step": 2831 }, { "epoch": 0.7794674189774995, "grad_norm": 0.19959559518050152, "learning_rate": 4.700346551793322e-06, "loss": 0.7662, "step": 2832 }, { "epoch": 0.7797426546480424, "grad_norm": 0.2036881632392109, "learning_rate": 4.689099080122434e-06, "loss": 0.7715, "step": 2833 }, { "epoch": 0.7800178903185853, "grad_norm": 0.2088481445737849, "learning_rate": 4.67786329432282e-06, "loss": 0.7939, "step": 2834 }, { "epoch": 0.7802931259891281, "grad_norm": 0.20050090525126577, "learning_rate": 4.666639202970049e-06, "loss": 0.7752, "step": 2835 }, { "epoch": 0.7805683616596711, "grad_norm": 0.20564022972980098, "learning_rate": 4.655426814630793e-06, "loss": 0.7887, "step": 2836 }, { "epoch": 0.780843597330214, "grad_norm": 0.20903195163303726, "learning_rate": 4.644226137862782e-06, "loss": 0.7685, "step": 2837 }, { "epoch": 0.7811188330007569, "grad_norm": 0.1984022960436632, "learning_rate": 4.63303718121479e-06, "loss": 0.7549, "step": 2838 }, { "epoch": 0.7813940686712998, "grad_norm": 0.20057372327640952, "learning_rate": 4.621859953226682e-06, "loss": 0.7885, "step": 2839 }, { "epoch": 0.7816693043418427, "grad_norm": 0.1994920525224683, "learning_rate": 4.610694462429337e-06, "loss": 0.7365, "step": 2840 }, { "epoch": 0.7819445400123856, "grad_norm": 0.20514206637741078, "learning_rate": 4.599540717344695e-06, "loss": 0.7638, "step": 2841 }, { "epoch": 0.7822197756829286, "grad_norm": 0.20543267077008986, "learning_rate": 4.588398726485719e-06, "loss": 0.75, "step": 2842 }, { "epoch": 0.7824950113534714, "grad_norm": 0.2030935113631456, "learning_rate": 4.577268498356411e-06, "loss": 0.7855, "step": 2843 }, { "epoch": 0.7827702470240143, "grad_norm": 0.20641867697581046, "learning_rate": 4.5661500414517955e-06, "loss": 0.777, "step": 2844 }, { "epoch": 0.7830454826945572, "grad_norm": 0.20495105897333385, "learning_rate": 4.555043364257894e-06, "loss": 0.7742, "step": 2845 }, { "epoch": 0.7833207183651001, "grad_norm": 0.1941721815018396, "learning_rate": 4.543948475251772e-06, "loss": 0.7553, "step": 2846 }, { "epoch": 0.783595954035643, "grad_norm": 0.20803754750016493, "learning_rate": 4.532865382901461e-06, "loss": 0.7842, "step": 2847 }, { "epoch": 0.783871189706186, "grad_norm": 0.20833329311102658, "learning_rate": 4.521794095666013e-06, "loss": 0.7815, "step": 2848 }, { "epoch": 0.7841464253767289, "grad_norm": 0.1995661810791607, "learning_rate": 4.510734621995465e-06, "loss": 0.7895, "step": 2849 }, { "epoch": 0.7844216610472717, "grad_norm": 0.20743474785424687, "learning_rate": 4.499686970330825e-06, "loss": 0.7634, "step": 2850 }, { "epoch": 0.7846968967178146, "grad_norm": 0.20061320673242355, "learning_rate": 4.4886511491041e-06, "loss": 0.7564, "step": 2851 }, { "epoch": 0.7849721323883575, "grad_norm": 0.19742642178470157, "learning_rate": 4.4776271667382364e-06, "loss": 0.7537, "step": 2852 }, { "epoch": 0.7852473680589004, "grad_norm": 0.303209575871292, "learning_rate": 4.466615031647188e-06, "loss": 0.7715, "step": 2853 }, { "epoch": 0.7855226037294434, "grad_norm": 0.19353483675849117, "learning_rate": 4.455614752235824e-06, "loss": 0.7783, "step": 2854 }, { "epoch": 0.7857978393999863, "grad_norm": 0.20233109926630172, "learning_rate": 4.4446263368999865e-06, "loss": 0.7697, "step": 2855 }, { "epoch": 0.7860730750705291, "grad_norm": 0.25814347319127223, "learning_rate": 4.433649794026467e-06, "loss": 0.7488, "step": 2856 }, { "epoch": 0.786348310741072, "grad_norm": 0.2027664849587621, "learning_rate": 4.422685131992975e-06, "loss": 0.777, "step": 2857 }, { "epoch": 0.7866235464116149, "grad_norm": 0.2075529363301236, "learning_rate": 4.411732359168168e-06, "loss": 0.8007, "step": 2858 }, { "epoch": 0.7868987820821578, "grad_norm": 0.2069726966220343, "learning_rate": 4.40079148391163e-06, "loss": 0.7592, "step": 2859 }, { "epoch": 0.7871740177527008, "grad_norm": 0.19377565222016482, "learning_rate": 4.3898625145738575e-06, "loss": 0.7657, "step": 2860 }, { "epoch": 0.7874492534232437, "grad_norm": 0.19292774395307385, "learning_rate": 4.378945459496264e-06, "loss": 0.7572, "step": 2861 }, { "epoch": 0.7877244890937866, "grad_norm": 0.1927745991170634, "learning_rate": 4.3680403270111645e-06, "loss": 0.7365, "step": 2862 }, { "epoch": 0.7879997247643294, "grad_norm": 0.19572380321966792, "learning_rate": 4.357147125441783e-06, "loss": 0.7647, "step": 2863 }, { "epoch": 0.7882749604348723, "grad_norm": 0.20637964893616226, "learning_rate": 4.346265863102221e-06, "loss": 0.7365, "step": 2864 }, { "epoch": 0.7885501961054152, "grad_norm": 0.1971231960174484, "learning_rate": 4.335396548297485e-06, "loss": 0.7513, "step": 2865 }, { "epoch": 0.7888254317759582, "grad_norm": 0.1929257926222743, "learning_rate": 4.324539189323458e-06, "loss": 0.747, "step": 2866 }, { "epoch": 0.7891006674465011, "grad_norm": 0.2525761325444834, "learning_rate": 4.313693794466893e-06, "loss": 0.7486, "step": 2867 }, { "epoch": 0.789375903117044, "grad_norm": 0.22952195434899925, "learning_rate": 4.302860372005422e-06, "loss": 0.7766, "step": 2868 }, { "epoch": 0.7896511387875869, "grad_norm": 0.2016058593886603, "learning_rate": 4.292038930207518e-06, "loss": 0.7764, "step": 2869 }, { "epoch": 0.7899263744581297, "grad_norm": 0.2038852986604692, "learning_rate": 4.281229477332534e-06, "loss": 0.7685, "step": 2870 }, { "epoch": 0.7902016101286726, "grad_norm": 0.20278325720176432, "learning_rate": 4.270432021630662e-06, "loss": 0.7638, "step": 2871 }, { "epoch": 0.7904768457992156, "grad_norm": 0.19698233401664667, "learning_rate": 4.25964657134293e-06, "loss": 0.7851, "step": 2872 }, { "epoch": 0.7907520814697585, "grad_norm": 0.20035466893421386, "learning_rate": 4.248873134701215e-06, "loss": 0.7702, "step": 2873 }, { "epoch": 0.7910273171403014, "grad_norm": 0.19584400606937383, "learning_rate": 4.238111719928219e-06, "loss": 0.7739, "step": 2874 }, { "epoch": 0.7913025528108443, "grad_norm": 0.21207906139692, "learning_rate": 4.227362335237472e-06, "loss": 0.7425, "step": 2875 }, { "epoch": 0.7915777884813872, "grad_norm": 0.21151286179926834, "learning_rate": 4.216624988833326e-06, "loss": 0.8108, "step": 2876 }, { "epoch": 0.79185302415193, "grad_norm": 0.2584595519581787, "learning_rate": 4.205899688910924e-06, "loss": 0.7767, "step": 2877 }, { "epoch": 0.792128259822473, "grad_norm": 0.2022452169325136, "learning_rate": 4.195186443656241e-06, "loss": 0.7623, "step": 2878 }, { "epoch": 0.7924034954930159, "grad_norm": 0.20441117139199405, "learning_rate": 4.184485261246032e-06, "loss": 0.7968, "step": 2879 }, { "epoch": 0.7926787311635588, "grad_norm": 0.2063763328636017, "learning_rate": 4.1737961498478555e-06, "loss": 0.7875, "step": 2880 }, { "epoch": 0.7929539668341017, "grad_norm": 0.19925364923707437, "learning_rate": 4.163119117620056e-06, "loss": 0.7842, "step": 2881 }, { "epoch": 0.7932292025046446, "grad_norm": 0.20247120914161668, "learning_rate": 4.152454172711755e-06, "loss": 0.7758, "step": 2882 }, { "epoch": 0.7935044381751875, "grad_norm": 0.21223059537589548, "learning_rate": 4.141801323262858e-06, "loss": 0.7941, "step": 2883 }, { "epoch": 0.7937796738457304, "grad_norm": 0.19199658544560622, "learning_rate": 4.131160577404021e-06, "loss": 0.7798, "step": 2884 }, { "epoch": 0.7940549095162733, "grad_norm": 0.20041257542187746, "learning_rate": 4.120531943256676e-06, "loss": 0.7664, "step": 2885 }, { "epoch": 0.7943301451868162, "grad_norm": 0.20165733492992646, "learning_rate": 4.1099154289330134e-06, "loss": 0.7962, "step": 2886 }, { "epoch": 0.7946053808573591, "grad_norm": 0.20314002376987073, "learning_rate": 4.099311042535956e-06, "loss": 0.7696, "step": 2887 }, { "epoch": 0.794880616527902, "grad_norm": 0.20175323515167573, "learning_rate": 4.08871879215919e-06, "loss": 0.749, "step": 2888 }, { "epoch": 0.7951558521984449, "grad_norm": 0.1912925297454833, "learning_rate": 4.078138685887125e-06, "loss": 0.7773, "step": 2889 }, { "epoch": 0.7954310878689879, "grad_norm": 0.19981498598106223, "learning_rate": 4.067570731794915e-06, "loss": 0.7435, "step": 2890 }, { "epoch": 0.7957063235395307, "grad_norm": 0.2824001525870759, "learning_rate": 4.05701493794842e-06, "loss": 0.7497, "step": 2891 }, { "epoch": 0.7959815592100736, "grad_norm": 0.19586364528959677, "learning_rate": 4.0464713124042366e-06, "loss": 0.7549, "step": 2892 }, { "epoch": 0.7962567948806165, "grad_norm": 0.21028430684116986, "learning_rate": 4.03593986320967e-06, "loss": 0.7681, "step": 2893 }, { "epoch": 0.7965320305511594, "grad_norm": 0.2153353658282543, "learning_rate": 4.025420598402721e-06, "loss": 0.7827, "step": 2894 }, { "epoch": 0.7968072662217023, "grad_norm": 0.1980078731555791, "learning_rate": 4.014913526012103e-06, "loss": 0.763, "step": 2895 }, { "epoch": 0.7970825018922453, "grad_norm": 0.19616826789355, "learning_rate": 4.004418654057218e-06, "loss": 0.7448, "step": 2896 }, { "epoch": 0.7973577375627882, "grad_norm": 0.22560287627183578, "learning_rate": 3.993935990548161e-06, "loss": 0.7554, "step": 2897 }, { "epoch": 0.797632973233331, "grad_norm": 0.20765214829922649, "learning_rate": 3.983465543485709e-06, "loss": 0.7949, "step": 2898 }, { "epoch": 0.7979082089038739, "grad_norm": 0.2036517887543124, "learning_rate": 3.973007320861304e-06, "loss": 0.7781, "step": 2899 }, { "epoch": 0.7981834445744168, "grad_norm": 0.2004734367854516, "learning_rate": 3.962561330657073e-06, "loss": 0.7555, "step": 2900 }, { "epoch": 0.7984586802449597, "grad_norm": 0.199398547264568, "learning_rate": 3.952127580845791e-06, "loss": 0.7622, "step": 2901 }, { "epoch": 0.7987339159155027, "grad_norm": 0.20187340908690163, "learning_rate": 3.941706079390897e-06, "loss": 0.7719, "step": 2902 }, { "epoch": 0.7990091515860456, "grad_norm": 0.19831027232711532, "learning_rate": 3.931296834246501e-06, "loss": 0.767, "step": 2903 }, { "epoch": 0.7992843872565885, "grad_norm": 0.20748317754463497, "learning_rate": 3.920899853357325e-06, "loss": 0.7584, "step": 2904 }, { "epoch": 0.7995596229271313, "grad_norm": 0.19223097328129718, "learning_rate": 3.910515144658758e-06, "loss": 0.7867, "step": 2905 }, { "epoch": 0.7998348585976742, "grad_norm": 0.20307266762815543, "learning_rate": 3.9001427160768e-06, "loss": 0.769, "step": 2906 }, { "epoch": 0.8001100942682171, "grad_norm": 0.5339420397855794, "learning_rate": 3.889782575528094e-06, "loss": 0.7565, "step": 2907 }, { "epoch": 0.8003853299387601, "grad_norm": 0.20344106716606247, "learning_rate": 3.879434730919904e-06, "loss": 0.7786, "step": 2908 }, { "epoch": 0.800660565609303, "grad_norm": 0.20038922402801615, "learning_rate": 3.86909919015009e-06, "loss": 0.7768, "step": 2909 }, { "epoch": 0.8009358012798459, "grad_norm": 0.19495880254516534, "learning_rate": 3.858775961107157e-06, "loss": 0.7799, "step": 2910 }, { "epoch": 0.8012110369503888, "grad_norm": 0.19617601320723022, "learning_rate": 3.8484650516701784e-06, "loss": 0.7875, "step": 2911 }, { "epoch": 0.8014862726209316, "grad_norm": 0.1920851553900602, "learning_rate": 3.838166469708844e-06, "loss": 0.7735, "step": 2912 }, { "epoch": 0.8017615082914745, "grad_norm": 0.20857451692256856, "learning_rate": 3.827880223083431e-06, "loss": 0.7998, "step": 2913 }, { "epoch": 0.8020367439620175, "grad_norm": 0.19636130182099734, "learning_rate": 3.817606319644793e-06, "loss": 0.7681, "step": 2914 }, { "epoch": 0.8023119796325604, "grad_norm": 0.2007209095200276, "learning_rate": 3.8073447672343798e-06, "loss": 0.7863, "step": 2915 }, { "epoch": 0.8025872153031033, "grad_norm": 0.2011422358942804, "learning_rate": 3.7970955736841887e-06, "loss": 0.7454, "step": 2916 }, { "epoch": 0.8028624509736462, "grad_norm": 0.20542209496523348, "learning_rate": 3.7868587468168216e-06, "loss": 0.7501, "step": 2917 }, { "epoch": 0.803137686644189, "grad_norm": 0.20360489944609322, "learning_rate": 3.7766342944454047e-06, "loss": 0.7949, "step": 2918 }, { "epoch": 0.8034129223147319, "grad_norm": 0.19787382286866595, "learning_rate": 3.7664222243736404e-06, "loss": 0.7631, "step": 2919 }, { "epoch": 0.8036881579852749, "grad_norm": 0.19776558419990134, "learning_rate": 3.75622254439578e-06, "loss": 0.7485, "step": 2920 }, { "epoch": 0.8039633936558178, "grad_norm": 0.2054346946568972, "learning_rate": 3.7460352622966034e-06, "loss": 0.7716, "step": 2921 }, { "epoch": 0.8042386293263607, "grad_norm": 0.20142581338538534, "learning_rate": 3.735860385851444e-06, "loss": 0.7834, "step": 2922 }, { "epoch": 0.8045138649969036, "grad_norm": 0.1999942983586885, "learning_rate": 3.725697922826166e-06, "loss": 0.7574, "step": 2923 }, { "epoch": 0.8047891006674465, "grad_norm": 0.20633088448915526, "learning_rate": 3.715547880977135e-06, "loss": 0.7621, "step": 2924 }, { "epoch": 0.8050643363379894, "grad_norm": 0.19525380005448217, "learning_rate": 3.7054102680512795e-06, "loss": 0.7787, "step": 2925 }, { "epoch": 0.8053395720085323, "grad_norm": 0.19401713555394456, "learning_rate": 3.6952850917860007e-06, "loss": 0.7663, "step": 2926 }, { "epoch": 0.8056148076790752, "grad_norm": 0.2041512462972966, "learning_rate": 3.685172359909235e-06, "loss": 0.7695, "step": 2927 }, { "epoch": 0.8058900433496181, "grad_norm": 0.19021372442475737, "learning_rate": 3.6750720801394014e-06, "loss": 0.7787, "step": 2928 }, { "epoch": 0.806165279020161, "grad_norm": 0.19983376635489705, "learning_rate": 3.6649842601854245e-06, "loss": 0.7661, "step": 2929 }, { "epoch": 0.8064405146907039, "grad_norm": 0.19094784715680338, "learning_rate": 3.6549089077467258e-06, "loss": 0.7669, "step": 2930 }, { "epoch": 0.8067157503612468, "grad_norm": 0.18971923430952783, "learning_rate": 3.6448460305131916e-06, "loss": 0.7657, "step": 2931 }, { "epoch": 0.8069909860317898, "grad_norm": 0.19290411168702953, "learning_rate": 3.6347956361652135e-06, "loss": 0.7557, "step": 2932 }, { "epoch": 0.8072662217023326, "grad_norm": 0.19465580767708632, "learning_rate": 3.624757732373629e-06, "loss": 0.7351, "step": 2933 }, { "epoch": 0.8075414573728755, "grad_norm": 0.19469910878182503, "learning_rate": 3.6147323267997592e-06, "loss": 0.7553, "step": 2934 }, { "epoch": 0.8078166930434184, "grad_norm": 0.19101637004024657, "learning_rate": 3.6047194270953846e-06, "loss": 0.7664, "step": 2935 }, { "epoch": 0.8080919287139613, "grad_norm": 0.2097603453646194, "learning_rate": 3.5947190409027276e-06, "loss": 0.7646, "step": 2936 }, { "epoch": 0.8083671643845042, "grad_norm": 0.20919170913934443, "learning_rate": 3.584731175854479e-06, "loss": 0.7921, "step": 2937 }, { "epoch": 0.8086424000550472, "grad_norm": 0.1955730678628757, "learning_rate": 3.5747558395737493e-06, "loss": 0.7665, "step": 2938 }, { "epoch": 0.8089176357255901, "grad_norm": 0.20074242203864368, "learning_rate": 3.5647930396741213e-06, "loss": 0.7552, "step": 2939 }, { "epoch": 0.8091928713961329, "grad_norm": 0.20185678062181947, "learning_rate": 3.5548427837595735e-06, "loss": 0.8127, "step": 2940 }, { "epoch": 0.8094681070666758, "grad_norm": 0.19660344000150748, "learning_rate": 3.54490507942453e-06, "loss": 0.7876, "step": 2941 }, { "epoch": 0.8097433427372187, "grad_norm": 0.19445674769325583, "learning_rate": 3.534979934253835e-06, "loss": 0.7555, "step": 2942 }, { "epoch": 0.8100185784077616, "grad_norm": 0.4918896739297948, "learning_rate": 3.5250673558227356e-06, "loss": 0.786, "step": 2943 }, { "epoch": 0.8102938140783046, "grad_norm": 0.20779493364366397, "learning_rate": 3.5151673516968956e-06, "loss": 0.7912, "step": 2944 }, { "epoch": 0.8105690497488475, "grad_norm": 0.19431374838052975, "learning_rate": 3.505279929432386e-06, "loss": 0.7623, "step": 2945 }, { "epoch": 0.8108442854193904, "grad_norm": 0.20518241247885818, "learning_rate": 3.495405096575664e-06, "loss": 0.7666, "step": 2946 }, { "epoch": 0.8111195210899332, "grad_norm": 0.20721678846360644, "learning_rate": 3.485542860663593e-06, "loss": 0.783, "step": 2947 }, { "epoch": 0.8113947567604761, "grad_norm": 0.20064913719736718, "learning_rate": 3.4756932292234e-06, "loss": 0.7949, "step": 2948 }, { "epoch": 0.811669992431019, "grad_norm": 0.19500803217870402, "learning_rate": 3.4658562097727177e-06, "loss": 0.7643, "step": 2949 }, { "epoch": 0.811945228101562, "grad_norm": 0.19439907281721805, "learning_rate": 3.4560318098195244e-06, "loss": 0.7589, "step": 2950 }, { "epoch": 0.8122204637721049, "grad_norm": 0.1951219514353315, "learning_rate": 3.446220036862191e-06, "loss": 0.752, "step": 2951 }, { "epoch": 0.8124956994426478, "grad_norm": 0.19990388577876386, "learning_rate": 3.4364208983894387e-06, "loss": 0.7522, "step": 2952 }, { "epoch": 0.8127709351131907, "grad_norm": 0.19486143805117162, "learning_rate": 3.426634401880351e-06, "loss": 0.7498, "step": 2953 }, { "epoch": 0.8130461707837335, "grad_norm": 0.18819736579265198, "learning_rate": 3.4168605548043663e-06, "loss": 0.7576, "step": 2954 }, { "epoch": 0.8133214064542764, "grad_norm": 0.1927019017067847, "learning_rate": 3.4070993646212493e-06, "loss": 0.7483, "step": 2955 }, { "epoch": 0.8135966421248194, "grad_norm": 0.19342814881717693, "learning_rate": 3.3973508387811237e-06, "loss": 0.7859, "step": 2956 }, { "epoch": 0.8138718777953623, "grad_norm": 0.19795873741353534, "learning_rate": 3.3876149847244454e-06, "loss": 0.7431, "step": 2957 }, { "epoch": 0.8141471134659052, "grad_norm": 0.2014558814393953, "learning_rate": 3.377891809881986e-06, "loss": 0.7834, "step": 2958 }, { "epoch": 0.8144223491364481, "grad_norm": 0.439267306111341, "learning_rate": 3.368181321674853e-06, "loss": 0.7731, "step": 2959 }, { "epoch": 0.814697584806991, "grad_norm": 0.19408651237144176, "learning_rate": 3.3584835275144647e-06, "loss": 0.7895, "step": 2960 }, { "epoch": 0.8149728204775339, "grad_norm": 0.2024694272879404, "learning_rate": 3.348798434802556e-06, "loss": 0.7944, "step": 2961 }, { "epoch": 0.8152480561480768, "grad_norm": 0.19688323788979772, "learning_rate": 3.339126050931165e-06, "loss": 0.7733, "step": 2962 }, { "epoch": 0.8155232918186197, "grad_norm": 0.19720016564533846, "learning_rate": 3.3294663832826204e-06, "loss": 0.7636, "step": 2963 }, { "epoch": 0.8157985274891626, "grad_norm": 0.19631478262680774, "learning_rate": 3.3198194392295636e-06, "loss": 0.7929, "step": 2964 }, { "epoch": 0.8160737631597055, "grad_norm": 0.194271823544458, "learning_rate": 3.3101852261349053e-06, "loss": 0.7771, "step": 2965 }, { "epoch": 0.8163489988302484, "grad_norm": 0.19924369045625256, "learning_rate": 3.300563751351855e-06, "loss": 0.7604, "step": 2966 }, { "epoch": 0.8166242345007914, "grad_norm": 0.19760410232127573, "learning_rate": 3.2909550222238916e-06, "loss": 0.7797, "step": 2967 }, { "epoch": 0.8168994701713342, "grad_norm": 0.196418416252485, "learning_rate": 3.281359046084771e-06, "loss": 0.7804, "step": 2968 }, { "epoch": 0.8171747058418771, "grad_norm": 0.19361302100665764, "learning_rate": 3.271775830258519e-06, "loss": 0.7388, "step": 2969 }, { "epoch": 0.81744994151242, "grad_norm": 0.20038229350070116, "learning_rate": 3.2622053820594025e-06, "loss": 0.773, "step": 2970 }, { "epoch": 0.8177251771829629, "grad_norm": 0.2031306493792435, "learning_rate": 3.252647708791965e-06, "loss": 0.8166, "step": 2971 }, { "epoch": 0.8180004128535058, "grad_norm": 0.197840823564769, "learning_rate": 3.243102817750996e-06, "loss": 0.7912, "step": 2972 }, { "epoch": 0.8182756485240488, "grad_norm": 0.19223881626719536, "learning_rate": 3.233570716221517e-06, "loss": 0.7467, "step": 2973 }, { "epoch": 0.8185508841945917, "grad_norm": 0.1861312366771784, "learning_rate": 3.224051411478799e-06, "loss": 0.7426, "step": 2974 }, { "epoch": 0.8188261198651345, "grad_norm": 0.20161153104256666, "learning_rate": 3.214544910788344e-06, "loss": 0.7794, "step": 2975 }, { "epoch": 0.8191013555356774, "grad_norm": 0.1983209725800102, "learning_rate": 3.205051221405886e-06, "loss": 0.7627, "step": 2976 }, { "epoch": 0.8193765912062203, "grad_norm": 0.19725390707820556, "learning_rate": 3.195570350577366e-06, "loss": 0.7879, "step": 2977 }, { "epoch": 0.8196518268767632, "grad_norm": 0.19682838303602035, "learning_rate": 3.186102305538956e-06, "loss": 0.7984, "step": 2978 }, { "epoch": 0.8199270625473062, "grad_norm": 0.19339250349237413, "learning_rate": 3.176647093517038e-06, "loss": 0.7782, "step": 2979 }, { "epoch": 0.8202022982178491, "grad_norm": 0.1955081265108639, "learning_rate": 3.1672047217281853e-06, "loss": 0.783, "step": 2980 }, { "epoch": 0.820477533888392, "grad_norm": 0.19813172300728882, "learning_rate": 3.157775197379187e-06, "loss": 0.7688, "step": 2981 }, { "epoch": 0.8207527695589348, "grad_norm": 0.1964422364694359, "learning_rate": 3.148358527667019e-06, "loss": 0.7796, "step": 2982 }, { "epoch": 0.8210280052294777, "grad_norm": 0.20253715265668207, "learning_rate": 3.138954719778848e-06, "loss": 0.7783, "step": 2983 }, { "epoch": 0.8213032409000206, "grad_norm": 0.19731673602494068, "learning_rate": 3.1295637808920286e-06, "loss": 0.7714, "step": 2984 }, { "epoch": 0.8215784765705636, "grad_norm": 0.18730586826954426, "learning_rate": 3.1201857181740804e-06, "loss": 0.7644, "step": 2985 }, { "epoch": 0.8218537122411065, "grad_norm": 0.4150346032094395, "learning_rate": 3.1108205387827085e-06, "loss": 0.7828, "step": 2986 }, { "epoch": 0.8221289479116494, "grad_norm": 0.20001015037380662, "learning_rate": 3.1014682498657733e-06, "loss": 0.7583, "step": 2987 }, { "epoch": 0.8224041835821922, "grad_norm": 0.19220897004573384, "learning_rate": 3.0921288585613053e-06, "loss": 0.7742, "step": 2988 }, { "epoch": 0.8226794192527351, "grad_norm": 0.1937326967582978, "learning_rate": 3.0828023719974975e-06, "loss": 0.7888, "step": 2989 }, { "epoch": 0.822954654923278, "grad_norm": 0.19002952589220604, "learning_rate": 3.0734887972926764e-06, "loss": 0.7444, "step": 2990 }, { "epoch": 0.823229890593821, "grad_norm": 0.19429892427198608, "learning_rate": 3.0641881415553266e-06, "loss": 0.773, "step": 2991 }, { "epoch": 0.8235051262643639, "grad_norm": 0.1991475218747388, "learning_rate": 3.0549004118840606e-06, "loss": 0.771, "step": 2992 }, { "epoch": 0.8237803619349068, "grad_norm": 0.19603075579799845, "learning_rate": 3.0456256153676402e-06, "loss": 0.7506, "step": 2993 }, { "epoch": 0.8240555976054497, "grad_norm": 0.19267081961901716, "learning_rate": 3.0363637590849483e-06, "loss": 0.7926, "step": 2994 }, { "epoch": 0.8243308332759925, "grad_norm": 0.1937744240979409, "learning_rate": 3.0271148501049796e-06, "loss": 0.7925, "step": 2995 }, { "epoch": 0.8246060689465354, "grad_norm": 0.1952112705672228, "learning_rate": 3.0178788954868764e-06, "loss": 0.7967, "step": 2996 }, { "epoch": 0.8248813046170784, "grad_norm": 0.18706297543548323, "learning_rate": 3.008655902279867e-06, "loss": 0.7704, "step": 2997 }, { "epoch": 0.8251565402876213, "grad_norm": 0.19281286307768228, "learning_rate": 2.9994458775232947e-06, "loss": 0.7863, "step": 2998 }, { "epoch": 0.8254317759581642, "grad_norm": 0.1940332554826848, "learning_rate": 2.9902488282466135e-06, "loss": 0.783, "step": 2999 }, { "epoch": 0.8257070116287071, "grad_norm": 0.19919472902227528, "learning_rate": 2.981064761469359e-06, "loss": 0.763, "step": 3000 }, { "epoch": 0.82598224729925, "grad_norm": 0.1898812375911402, "learning_rate": 2.9718936842011727e-06, "loss": 0.7741, "step": 3001 }, { "epoch": 0.8262574829697928, "grad_norm": 0.19317549723498484, "learning_rate": 2.962735603441762e-06, "loss": 0.7943, "step": 3002 }, { "epoch": 0.8265327186403358, "grad_norm": 0.4836962372813598, "learning_rate": 2.9535905261809492e-06, "loss": 0.7918, "step": 3003 }, { "epoch": 0.8268079543108787, "grad_norm": 0.2012962845456614, "learning_rate": 2.9444584593985914e-06, "loss": 0.7917, "step": 3004 }, { "epoch": 0.8270831899814216, "grad_norm": 0.18626972790480248, "learning_rate": 2.935339410064646e-06, "loss": 0.7644, "step": 3005 }, { "epoch": 0.8273584256519645, "grad_norm": 0.1929550257006686, "learning_rate": 2.9262333851391234e-06, "loss": 0.7899, "step": 3006 }, { "epoch": 0.8276336613225074, "grad_norm": 0.18986651972753832, "learning_rate": 2.917140391572084e-06, "loss": 0.7416, "step": 3007 }, { "epoch": 0.8279088969930503, "grad_norm": 0.19094706258894267, "learning_rate": 2.908060436303661e-06, "loss": 0.7583, "step": 3008 }, { "epoch": 0.8281841326635933, "grad_norm": 0.19494245808498553, "learning_rate": 2.8989935262640245e-06, "loss": 0.7852, "step": 3009 }, { "epoch": 0.8284593683341361, "grad_norm": 0.1939673339423602, "learning_rate": 2.8899396683733916e-06, "loss": 0.7855, "step": 3010 }, { "epoch": 0.828734604004679, "grad_norm": 0.1922472596497471, "learning_rate": 2.880898869542019e-06, "loss": 0.7747, "step": 3011 }, { "epoch": 0.8290098396752219, "grad_norm": 0.19109372042741662, "learning_rate": 2.871871136670188e-06, "loss": 0.7545, "step": 3012 }, { "epoch": 0.8292850753457648, "grad_norm": 0.19998845581220057, "learning_rate": 2.8628564766482193e-06, "loss": 0.8223, "step": 3013 }, { "epoch": 0.8295603110163077, "grad_norm": 0.18875318151334095, "learning_rate": 2.8538548963564405e-06, "loss": 0.775, "step": 3014 }, { "epoch": 0.8298355466868507, "grad_norm": 0.18877364565375876, "learning_rate": 2.844866402665214e-06, "loss": 0.7682, "step": 3015 }, { "epoch": 0.8301107823573936, "grad_norm": 0.18672888348505698, "learning_rate": 2.8358910024349006e-06, "loss": 0.7456, "step": 3016 }, { "epoch": 0.8303860180279364, "grad_norm": 0.32179740836847887, "learning_rate": 2.8269287025158767e-06, "loss": 0.7346, "step": 3017 }, { "epoch": 0.8306612536984793, "grad_norm": 0.1887537464602044, "learning_rate": 2.8179795097485163e-06, "loss": 0.7658, "step": 3018 }, { "epoch": 0.8309364893690222, "grad_norm": 0.19110158631182372, "learning_rate": 2.8090434309631852e-06, "loss": 0.8016, "step": 3019 }, { "epoch": 0.8312117250395651, "grad_norm": 0.19256974946442487, "learning_rate": 2.8001204729802435e-06, "loss": 0.7815, "step": 3020 }, { "epoch": 0.8314869607101081, "grad_norm": 0.1970175901032695, "learning_rate": 2.791210642610045e-06, "loss": 0.7681, "step": 3021 }, { "epoch": 0.831762196380651, "grad_norm": 0.2015234789428279, "learning_rate": 2.7823139466529082e-06, "loss": 0.7663, "step": 3022 }, { "epoch": 0.8320374320511938, "grad_norm": 0.1931552456652713, "learning_rate": 2.7734303918991367e-06, "loss": 0.7393, "step": 3023 }, { "epoch": 0.8323126677217367, "grad_norm": 0.1982835634815639, "learning_rate": 2.764559985129007e-06, "loss": 0.7899, "step": 3024 }, { "epoch": 0.8325879033922796, "grad_norm": 0.19993477247641417, "learning_rate": 2.7557027331127572e-06, "loss": 0.7483, "step": 3025 }, { "epoch": 0.8328631390628225, "grad_norm": 0.20098094993148255, "learning_rate": 2.746858642610577e-06, "loss": 0.7763, "step": 3026 }, { "epoch": 0.8331383747333655, "grad_norm": 0.19585662373306542, "learning_rate": 2.73802772037262e-06, "loss": 0.7794, "step": 3027 }, { "epoch": 0.8334136104039084, "grad_norm": 0.19508845229334704, "learning_rate": 2.729209973138998e-06, "loss": 0.7656, "step": 3028 }, { "epoch": 0.8336888460744513, "grad_norm": 0.19625263210101154, "learning_rate": 2.720405407639739e-06, "loss": 0.7887, "step": 3029 }, { "epoch": 0.8339640817449941, "grad_norm": 0.1924255004539693, "learning_rate": 2.71161403059484e-06, "loss": 0.7594, "step": 3030 }, { "epoch": 0.834239317415537, "grad_norm": 0.1915629995397315, "learning_rate": 2.7028358487142137e-06, "loss": 0.7801, "step": 3031 }, { "epoch": 0.8345145530860799, "grad_norm": 0.19486619175228329, "learning_rate": 2.6940708686977137e-06, "loss": 0.7872, "step": 3032 }, { "epoch": 0.8347897887566229, "grad_norm": 0.19743173389535998, "learning_rate": 2.6853190972351085e-06, "loss": 0.758, "step": 3033 }, { "epoch": 0.8350650244271658, "grad_norm": 0.19409464429494008, "learning_rate": 2.6765805410060863e-06, "loss": 0.7796, "step": 3034 }, { "epoch": 0.8353402600977087, "grad_norm": 0.19205024981287255, "learning_rate": 2.6678552066802566e-06, "loss": 0.7703, "step": 3035 }, { "epoch": 0.8356154957682516, "grad_norm": 0.18985458380871004, "learning_rate": 2.659143100917121e-06, "loss": 0.7662, "step": 3036 }, { "epoch": 0.8358907314387944, "grad_norm": 0.1936561993222744, "learning_rate": 2.6504442303661027e-06, "loss": 0.7665, "step": 3037 }, { "epoch": 0.8361659671093373, "grad_norm": 0.18802118222778885, "learning_rate": 2.6417586016665174e-06, "loss": 0.771, "step": 3038 }, { "epoch": 0.8364412027798803, "grad_norm": 0.20100424618090773, "learning_rate": 2.6330862214475673e-06, "loss": 0.7877, "step": 3039 }, { "epoch": 0.8367164384504232, "grad_norm": 0.20037185262232557, "learning_rate": 2.624427096328357e-06, "loss": 0.7814, "step": 3040 }, { "epoch": 0.8369916741209661, "grad_norm": 0.19698960155073983, "learning_rate": 2.6157812329178556e-06, "loss": 0.7892, "step": 3041 }, { "epoch": 0.837266909791509, "grad_norm": 0.19251790664222262, "learning_rate": 2.6071486378149225e-06, "loss": 0.7851, "step": 3042 }, { "epoch": 0.8375421454620519, "grad_norm": 0.20020046308820605, "learning_rate": 2.598529317608296e-06, "loss": 0.8155, "step": 3043 }, { "epoch": 0.8378173811325947, "grad_norm": 0.19557821252234994, "learning_rate": 2.5899232788765604e-06, "loss": 0.7396, "step": 3044 }, { "epoch": 0.8380926168031377, "grad_norm": 0.18915899568465921, "learning_rate": 2.581330528188186e-06, "loss": 0.7837, "step": 3045 }, { "epoch": 0.8383678524736806, "grad_norm": 0.19241535460006218, "learning_rate": 2.5727510721014916e-06, "loss": 0.7821, "step": 3046 }, { "epoch": 0.8386430881442235, "grad_norm": 0.1917539653288574, "learning_rate": 2.5641849171646473e-06, "loss": 0.7711, "step": 3047 }, { "epoch": 0.8389183238147664, "grad_norm": 0.19484514647989906, "learning_rate": 2.555632069915681e-06, "loss": 0.7632, "step": 3048 }, { "epoch": 0.8391935594853093, "grad_norm": 0.1926394897604978, "learning_rate": 2.547092536882445e-06, "loss": 0.7314, "step": 3049 }, { "epoch": 0.8394687951558522, "grad_norm": 0.19796859732888455, "learning_rate": 2.5385663245826498e-06, "loss": 0.7662, "step": 3050 }, { "epoch": 0.8397440308263951, "grad_norm": 0.1915626795030087, "learning_rate": 2.530053439523823e-06, "loss": 0.8084, "step": 3051 }, { "epoch": 0.840019266496938, "grad_norm": 0.19644886570060285, "learning_rate": 2.5215538882033296e-06, "loss": 0.7609, "step": 3052 }, { "epoch": 0.8402945021674809, "grad_norm": 0.2167925402527184, "learning_rate": 2.5130676771083585e-06, "loss": 0.7545, "step": 3053 }, { "epoch": 0.8405697378380238, "grad_norm": 0.20080450993439886, "learning_rate": 2.5045948127159105e-06, "loss": 0.7818, "step": 3054 }, { "epoch": 0.8408449735085667, "grad_norm": 0.2092790475384215, "learning_rate": 2.4961353014928103e-06, "loss": 0.7866, "step": 3055 }, { "epoch": 0.8411202091791096, "grad_norm": 0.18866886391908752, "learning_rate": 2.4876891498956758e-06, "loss": 0.7528, "step": 3056 }, { "epoch": 0.8413954448496526, "grad_norm": 0.21095465191219404, "learning_rate": 2.4792563643709367e-06, "loss": 0.8106, "step": 3057 }, { "epoch": 0.8416706805201954, "grad_norm": 0.1888393831679301, "learning_rate": 2.4708369513548293e-06, "loss": 0.7708, "step": 3058 }, { "epoch": 0.8419459161907383, "grad_norm": 0.18666594993816893, "learning_rate": 2.4624309172733597e-06, "loss": 0.7579, "step": 3059 }, { "epoch": 0.8422211518612812, "grad_norm": 0.18384518346947176, "learning_rate": 2.4540382685423535e-06, "loss": 0.7486, "step": 3060 }, { "epoch": 0.8424963875318241, "grad_norm": 0.1842857426357308, "learning_rate": 2.4456590115673963e-06, "loss": 0.7396, "step": 3061 }, { "epoch": 0.842771623202367, "grad_norm": 0.18227634287949585, "learning_rate": 2.437293152743865e-06, "loss": 0.7548, "step": 3062 }, { "epoch": 0.84304685887291, "grad_norm": 0.18846389925830498, "learning_rate": 2.4289406984569008e-06, "loss": 0.7603, "step": 3063 }, { "epoch": 0.8433220945434529, "grad_norm": 0.22281292004390954, "learning_rate": 2.4206016550814227e-06, "loss": 0.7945, "step": 3064 }, { "epoch": 0.8435973302139957, "grad_norm": 0.1879150711883053, "learning_rate": 2.4122760289821144e-06, "loss": 0.7636, "step": 3065 }, { "epoch": 0.8438725658845386, "grad_norm": 0.19575038549231671, "learning_rate": 2.4039638265134045e-06, "loss": 0.7655, "step": 3066 }, { "epoch": 0.8441478015550815, "grad_norm": 0.19200307974339387, "learning_rate": 2.3956650540195024e-06, "loss": 0.7688, "step": 3067 }, { "epoch": 0.8444230372256244, "grad_norm": 0.1944801103609571, "learning_rate": 2.3873797178343417e-06, "loss": 0.752, "step": 3068 }, { "epoch": 0.8446982728961674, "grad_norm": 0.3400995668774979, "learning_rate": 2.3791078242816124e-06, "loss": 0.7687, "step": 3069 }, { "epoch": 0.8449735085667103, "grad_norm": 0.3406649122138235, "learning_rate": 2.370849379674749e-06, "loss": 0.7593, "step": 3070 }, { "epoch": 0.8452487442372532, "grad_norm": 0.18803388959424092, "learning_rate": 2.3626043903169073e-06, "loss": 0.7539, "step": 3071 }, { "epoch": 0.845523979907796, "grad_norm": 0.19313704606149756, "learning_rate": 2.3543728625009885e-06, "loss": 0.7572, "step": 3072 }, { "epoch": 0.8457992155783389, "grad_norm": 0.19190785413980008, "learning_rate": 2.3461548025096015e-06, "loss": 0.7487, "step": 3073 }, { "epoch": 0.8460744512488818, "grad_norm": 0.19146675929192586, "learning_rate": 2.3379502166151015e-06, "loss": 0.7728, "step": 3074 }, { "epoch": 0.8463496869194248, "grad_norm": 0.1853750993600032, "learning_rate": 2.3297591110795437e-06, "loss": 0.7585, "step": 3075 }, { "epoch": 0.8466249225899677, "grad_norm": 0.18446562759880664, "learning_rate": 2.3215814921546853e-06, "loss": 0.7436, "step": 3076 }, { "epoch": 0.8469001582605106, "grad_norm": 0.19129834660475867, "learning_rate": 2.313417366082016e-06, "loss": 0.7819, "step": 3077 }, { "epoch": 0.8471753939310535, "grad_norm": 0.1923504287410237, "learning_rate": 2.3052667390926975e-06, "loss": 0.766, "step": 3078 }, { "epoch": 0.8474506296015963, "grad_norm": 0.18878989305329127, "learning_rate": 2.297129617407612e-06, "loss": 0.7693, "step": 3079 }, { "epoch": 0.8477258652721392, "grad_norm": 0.26253976707159266, "learning_rate": 2.2890060072373288e-06, "loss": 0.7675, "step": 3080 }, { "epoch": 0.8480011009426822, "grad_norm": 0.19511497948672077, "learning_rate": 2.280895914782084e-06, "loss": 0.7673, "step": 3081 }, { "epoch": 0.8482763366132251, "grad_norm": 0.18956334635108352, "learning_rate": 2.2727993462318376e-06, "loss": 0.7595, "step": 3082 }, { "epoch": 0.848551572283768, "grad_norm": 0.19191875617856083, "learning_rate": 2.2647163077661837e-06, "loss": 0.7675, "step": 3083 }, { "epoch": 0.8488268079543109, "grad_norm": 0.1887789355258137, "learning_rate": 2.256646805554419e-06, "loss": 0.7641, "step": 3084 }, { "epoch": 0.8491020436248538, "grad_norm": 0.18819429071318444, "learning_rate": 2.2485908457555027e-06, "loss": 0.7295, "step": 3085 }, { "epoch": 0.8493772792953966, "grad_norm": 0.18802130214960133, "learning_rate": 2.2405484345180438e-06, "loss": 0.7566, "step": 3086 }, { "epoch": 0.8496525149659396, "grad_norm": 0.19439351714579375, "learning_rate": 2.232519577980332e-06, "loss": 0.7339, "step": 3087 }, { "epoch": 0.8499277506364825, "grad_norm": 0.19693049546862057, "learning_rate": 2.224504282270288e-06, "loss": 0.7624, "step": 3088 }, { "epoch": 0.8502029863070254, "grad_norm": 0.1856672561841339, "learning_rate": 2.2165025535055128e-06, "loss": 0.7638, "step": 3089 }, { "epoch": 0.8504782219775683, "grad_norm": 0.2458584839079196, "learning_rate": 2.20851439779322e-06, "loss": 0.7547, "step": 3090 }, { "epoch": 0.8507534576481112, "grad_norm": 0.18778140660727222, "learning_rate": 2.2005398212302853e-06, "loss": 0.7702, "step": 3091 }, { "epoch": 0.851028693318654, "grad_norm": 0.18783755518316525, "learning_rate": 2.192578829903216e-06, "loss": 0.7663, "step": 3092 }, { "epoch": 0.851303928989197, "grad_norm": 0.1950484061634625, "learning_rate": 2.18463142988814e-06, "loss": 0.7838, "step": 3093 }, { "epoch": 0.8515791646597399, "grad_norm": 0.19217844936554562, "learning_rate": 2.176697627250828e-06, "loss": 0.7642, "step": 3094 }, { "epoch": 0.8518544003302828, "grad_norm": 0.19072685885204396, "learning_rate": 2.16877742804666e-06, "loss": 0.7951, "step": 3095 }, { "epoch": 0.8521296360008257, "grad_norm": 0.1887281877148052, "learning_rate": 2.160870838320639e-06, "loss": 0.7711, "step": 3096 }, { "epoch": 0.8524048716713686, "grad_norm": 0.18823243678510676, "learning_rate": 2.152977864107386e-06, "loss": 0.764, "step": 3097 }, { "epoch": 0.8526801073419115, "grad_norm": 0.1905286541883655, "learning_rate": 2.1450985114311163e-06, "loss": 0.7634, "step": 3098 }, { "epoch": 0.8529553430124545, "grad_norm": 0.18700851525890116, "learning_rate": 2.137232786305661e-06, "loss": 0.7843, "step": 3099 }, { "epoch": 0.8532305786829973, "grad_norm": 0.18477582388788863, "learning_rate": 2.1293806947344398e-06, "loss": 0.7641, "step": 3100 }, { "epoch": 0.8535058143535402, "grad_norm": 0.1910166465102131, "learning_rate": 2.1215422427104748e-06, "loss": 0.7712, "step": 3101 }, { "epoch": 0.8537810500240831, "grad_norm": 0.19196299357818025, "learning_rate": 2.1137174362163783e-06, "loss": 0.7778, "step": 3102 }, { "epoch": 0.854056285694626, "grad_norm": 0.21988122045040998, "learning_rate": 2.1059062812243437e-06, "loss": 0.7832, "step": 3103 }, { "epoch": 0.8543315213651689, "grad_norm": 0.19395509727436827, "learning_rate": 2.098108783696149e-06, "loss": 0.7716, "step": 3104 }, { "epoch": 0.8546067570357119, "grad_norm": 0.1902137491034438, "learning_rate": 2.09032494958314e-06, "loss": 0.7617, "step": 3105 }, { "epoch": 0.8548819927062548, "grad_norm": 0.1855414369841561, "learning_rate": 2.0825547848262405e-06, "loss": 0.7504, "step": 3106 }, { "epoch": 0.8551572283767976, "grad_norm": 0.20050767045060103, "learning_rate": 2.0747982953559464e-06, "loss": 0.7775, "step": 3107 }, { "epoch": 0.8554324640473405, "grad_norm": 0.19249308183899058, "learning_rate": 2.0670554870923042e-06, "loss": 0.7588, "step": 3108 }, { "epoch": 0.8557076997178834, "grad_norm": 0.2232603600663087, "learning_rate": 2.0593263659449247e-06, "loss": 0.7739, "step": 3109 }, { "epoch": 0.8559829353884263, "grad_norm": 0.18450780097643896, "learning_rate": 2.0516109378129756e-06, "loss": 0.761, "step": 3110 }, { "epoch": 0.8562581710589693, "grad_norm": 0.18469898200814766, "learning_rate": 2.0439092085851685e-06, "loss": 0.7671, "step": 3111 }, { "epoch": 0.8565334067295122, "grad_norm": 0.18658290947008352, "learning_rate": 2.0362211841397594e-06, "loss": 0.7742, "step": 3112 }, { "epoch": 0.856808642400055, "grad_norm": 0.1861967748207594, "learning_rate": 2.028546870344543e-06, "loss": 0.7398, "step": 3113 }, { "epoch": 0.8570838780705979, "grad_norm": 0.1912475258522535, "learning_rate": 2.0208862730568614e-06, "loss": 0.8127, "step": 3114 }, { "epoch": 0.8573591137411408, "grad_norm": 0.1943396310289039, "learning_rate": 2.01323939812357e-06, "loss": 0.774, "step": 3115 }, { "epoch": 0.8576343494116837, "grad_norm": 0.190822139485722, "learning_rate": 2.0056062513810583e-06, "loss": 0.78, "step": 3116 }, { "epoch": 0.8579095850822267, "grad_norm": 0.1852874973163684, "learning_rate": 1.9979868386552436e-06, "loss": 0.7775, "step": 3117 }, { "epoch": 0.8581848207527696, "grad_norm": 0.19024843533673702, "learning_rate": 1.990381165761557e-06, "loss": 0.7629, "step": 3118 }, { "epoch": 0.8584600564233125, "grad_norm": 0.18781757528629295, "learning_rate": 1.982789238504941e-06, "loss": 0.7609, "step": 3119 }, { "epoch": 0.8587352920938554, "grad_norm": 0.1911685631427986, "learning_rate": 1.975211062679845e-06, "loss": 0.7642, "step": 3120 }, { "epoch": 0.8590105277643982, "grad_norm": 0.19243268059652235, "learning_rate": 1.967646644070229e-06, "loss": 0.778, "step": 3121 }, { "epoch": 0.8592857634349411, "grad_norm": 0.18274446698876182, "learning_rate": 1.960095988449546e-06, "loss": 0.7502, "step": 3122 }, { "epoch": 0.8595609991054841, "grad_norm": 0.18676681567293904, "learning_rate": 1.9525591015807465e-06, "loss": 0.7595, "step": 3123 }, { "epoch": 0.859836234776027, "grad_norm": 0.18750218930674167, "learning_rate": 1.945035989216284e-06, "loss": 0.7646, "step": 3124 }, { "epoch": 0.8601114704465699, "grad_norm": 0.19350729645871378, "learning_rate": 1.937526657098079e-06, "loss": 0.7515, "step": 3125 }, { "epoch": 0.8603867061171128, "grad_norm": 0.18598660251185678, "learning_rate": 1.930031110957551e-06, "loss": 0.7478, "step": 3126 }, { "epoch": 0.8606619417876556, "grad_norm": 0.18797587796946585, "learning_rate": 1.922549356515582e-06, "loss": 0.7358, "step": 3127 }, { "epoch": 0.8609371774581985, "grad_norm": 0.19094824376945202, "learning_rate": 1.915081399482539e-06, "loss": 0.7729, "step": 3128 }, { "epoch": 0.8612124131287415, "grad_norm": 0.1937358458010856, "learning_rate": 1.9076272455582635e-06, "loss": 0.7826, "step": 3129 }, { "epoch": 0.8614876487992844, "grad_norm": 0.39977367190921886, "learning_rate": 1.9001869004320395e-06, "loss": 0.7631, "step": 3130 }, { "epoch": 0.8617628844698273, "grad_norm": 0.18933689560820816, "learning_rate": 1.8927603697826403e-06, "loss": 0.7727, "step": 3131 }, { "epoch": 0.8620381201403702, "grad_norm": 0.187226965661617, "learning_rate": 1.8853476592782717e-06, "loss": 0.7491, "step": 3132 }, { "epoch": 0.8623133558109131, "grad_norm": 0.19019037370739847, "learning_rate": 1.8779487745766034e-06, "loss": 0.7904, "step": 3133 }, { "epoch": 0.862588591481456, "grad_norm": 0.1846789350724278, "learning_rate": 1.870563721324754e-06, "loss": 0.7587, "step": 3134 }, { "epoch": 0.8628638271519989, "grad_norm": 0.18668370106947693, "learning_rate": 1.8631925051592748e-06, "loss": 0.7821, "step": 3135 }, { "epoch": 0.8631390628225418, "grad_norm": 0.19123886465591422, "learning_rate": 1.8558351317061696e-06, "loss": 0.7677, "step": 3136 }, { "epoch": 0.8634142984930847, "grad_norm": 0.19004359662777157, "learning_rate": 1.8484916065808622e-06, "loss": 0.7772, "step": 3137 }, { "epoch": 0.8636895341636276, "grad_norm": 0.18887911554861778, "learning_rate": 1.8411619353882182e-06, "loss": 0.7514, "step": 3138 }, { "epoch": 0.8639647698341705, "grad_norm": 0.18953425988012504, "learning_rate": 1.833846123722529e-06, "loss": 0.7806, "step": 3139 }, { "epoch": 0.8642400055047134, "grad_norm": 0.5388338233803865, "learning_rate": 1.8265441771675019e-06, "loss": 0.7634, "step": 3140 }, { "epoch": 0.8645152411752564, "grad_norm": 0.19444099108786989, "learning_rate": 1.8192561012962673e-06, "loss": 0.7535, "step": 3141 }, { "epoch": 0.8647904768457992, "grad_norm": 0.18353107682211708, "learning_rate": 1.8119819016713624e-06, "loss": 0.7502, "step": 3142 }, { "epoch": 0.8650657125163421, "grad_norm": 0.19021247181972498, "learning_rate": 1.8047215838447397e-06, "loss": 0.7739, "step": 3143 }, { "epoch": 0.865340948186885, "grad_norm": 0.1911367831611288, "learning_rate": 1.7974751533577572e-06, "loss": 0.8046, "step": 3144 }, { "epoch": 0.8656161838574279, "grad_norm": 0.18298775735981768, "learning_rate": 1.7902426157411622e-06, "loss": 0.7714, "step": 3145 }, { "epoch": 0.8658914195279708, "grad_norm": 0.4505798359679756, "learning_rate": 1.783023976515117e-06, "loss": 0.8052, "step": 3146 }, { "epoch": 0.8661666551985138, "grad_norm": 0.19479139866548645, "learning_rate": 1.7758192411891584e-06, "loss": 0.8106, "step": 3147 }, { "epoch": 0.8664418908690567, "grad_norm": 0.1934580864132261, "learning_rate": 1.7686284152622257e-06, "loss": 0.7662, "step": 3148 }, { "epoch": 0.8667171265395995, "grad_norm": 0.18951276942974823, "learning_rate": 1.7614515042226289e-06, "loss": 0.7829, "step": 3149 }, { "epoch": 0.8669923622101424, "grad_norm": 0.18965836215958998, "learning_rate": 1.7542885135480636e-06, "loss": 0.7802, "step": 3150 }, { "epoch": 0.8672675978806853, "grad_norm": 0.18843350597762612, "learning_rate": 1.7471394487056082e-06, "loss": 0.774, "step": 3151 }, { "epoch": 0.8675428335512282, "grad_norm": 0.19853461141996423, "learning_rate": 1.7400043151516955e-06, "loss": 0.7543, "step": 3152 }, { "epoch": 0.8678180692217712, "grad_norm": 0.1883128381789863, "learning_rate": 1.7328831183321448e-06, "loss": 0.7669, "step": 3153 }, { "epoch": 0.8680933048923141, "grad_norm": 0.19324496425162602, "learning_rate": 1.725775863682122e-06, "loss": 0.7964, "step": 3154 }, { "epoch": 0.868368540562857, "grad_norm": 0.18926465310836058, "learning_rate": 1.718682556626161e-06, "loss": 0.7768, "step": 3155 }, { "epoch": 0.8686437762333998, "grad_norm": 0.186403577544894, "learning_rate": 1.7116032025781515e-06, "loss": 0.743, "step": 3156 }, { "epoch": 0.8689190119039427, "grad_norm": 0.18083539936835596, "learning_rate": 1.7045378069413222e-06, "loss": 0.7643, "step": 3157 }, { "epoch": 0.8691942475744856, "grad_norm": 0.1872850661697392, "learning_rate": 1.6974863751082638e-06, "loss": 0.7674, "step": 3158 }, { "epoch": 0.8694694832450286, "grad_norm": 0.19361785999115488, "learning_rate": 1.6904489124608892e-06, "loss": 0.7449, "step": 3159 }, { "epoch": 0.8697447189155715, "grad_norm": 0.19837906360231294, "learning_rate": 1.6834254243704773e-06, "loss": 0.7953, "step": 3160 }, { "epoch": 0.8700199545861144, "grad_norm": 0.18428458602536607, "learning_rate": 1.67641591619762e-06, "loss": 0.7467, "step": 3161 }, { "epoch": 0.8702951902566572, "grad_norm": 0.18759872592212673, "learning_rate": 1.6694203932922404e-06, "loss": 0.7823, "step": 3162 }, { "epoch": 0.8705704259272001, "grad_norm": 0.19769691867793007, "learning_rate": 1.6624388609935981e-06, "loss": 0.7689, "step": 3163 }, { "epoch": 0.870845661597743, "grad_norm": 0.19168604251327903, "learning_rate": 1.6554713246302645e-06, "loss": 0.7857, "step": 3164 }, { "epoch": 0.871120897268286, "grad_norm": 0.18773544811091583, "learning_rate": 1.648517789520132e-06, "loss": 0.7462, "step": 3165 }, { "epoch": 0.8713961329388289, "grad_norm": 0.1935682038227588, "learning_rate": 1.641578260970409e-06, "loss": 0.7864, "step": 3166 }, { "epoch": 0.8716713686093718, "grad_norm": 0.18931453176890442, "learning_rate": 1.6346527442776118e-06, "loss": 0.7459, "step": 3167 }, { "epoch": 0.8719466042799147, "grad_norm": 0.19666816714300905, "learning_rate": 1.6277412447275653e-06, "loss": 0.775, "step": 3168 }, { "epoch": 0.8722218399504575, "grad_norm": 0.1831436738641368, "learning_rate": 1.620843767595388e-06, "loss": 0.7758, "step": 3169 }, { "epoch": 0.8724970756210004, "grad_norm": 0.19154673596513266, "learning_rate": 1.6139603181455022e-06, "loss": 0.7869, "step": 3170 }, { "epoch": 0.8727723112915434, "grad_norm": 0.19284064944952697, "learning_rate": 1.6070909016316271e-06, "loss": 0.7554, "step": 3171 }, { "epoch": 0.8730475469620863, "grad_norm": 0.19356338535767414, "learning_rate": 1.6002355232967603e-06, "loss": 0.7748, "step": 3172 }, { "epoch": 0.8733227826326292, "grad_norm": 0.19480134196135337, "learning_rate": 1.593394188373194e-06, "loss": 0.7846, "step": 3173 }, { "epoch": 0.8735980183031721, "grad_norm": 0.19437637012350067, "learning_rate": 1.586566902082498e-06, "loss": 0.7871, "step": 3174 }, { "epoch": 0.873873253973715, "grad_norm": 0.20935178360694914, "learning_rate": 1.5797536696355287e-06, "loss": 0.7568, "step": 3175 }, { "epoch": 0.8741484896442578, "grad_norm": 0.19348409819622703, "learning_rate": 1.5729544962323972e-06, "loss": 0.7798, "step": 3176 }, { "epoch": 0.8744237253148008, "grad_norm": 0.1945280528371594, "learning_rate": 1.5661693870625017e-06, "loss": 0.7789, "step": 3177 }, { "epoch": 0.8746989609853437, "grad_norm": 0.19121053308573158, "learning_rate": 1.5593983473045017e-06, "loss": 0.7547, "step": 3178 }, { "epoch": 0.8749741966558866, "grad_norm": 0.18661067137349469, "learning_rate": 1.5526413821263097e-06, "loss": 0.7409, "step": 3179 }, { "epoch": 0.8752494323264295, "grad_norm": 0.1867149380573226, "learning_rate": 1.5458984966851077e-06, "loss": 0.7708, "step": 3180 }, { "epoch": 0.8755246679969724, "grad_norm": 0.18681874219225864, "learning_rate": 1.5391696961273228e-06, "loss": 0.7559, "step": 3181 }, { "epoch": 0.8757999036675153, "grad_norm": 0.1827532723605175, "learning_rate": 1.5324549855886405e-06, "loss": 0.7864, "step": 3182 }, { "epoch": 0.8760751393380583, "grad_norm": 0.18886736509452898, "learning_rate": 1.525754370193986e-06, "loss": 0.7458, "step": 3183 }, { "epoch": 0.8763503750086011, "grad_norm": 0.18389280692880228, "learning_rate": 1.5190678550575256e-06, "loss": 0.7757, "step": 3184 }, { "epoch": 0.876625610679144, "grad_norm": 0.18341870730942056, "learning_rate": 1.5123954452826682e-06, "loss": 0.7369, "step": 3185 }, { "epoch": 0.8769008463496869, "grad_norm": 0.19170232393290346, "learning_rate": 1.5057371459620518e-06, "loss": 0.7757, "step": 3186 }, { "epoch": 0.8771760820202298, "grad_norm": 0.18727671536564067, "learning_rate": 1.4990929621775485e-06, "loss": 0.747, "step": 3187 }, { "epoch": 0.8774513176907727, "grad_norm": 0.18804982515158827, "learning_rate": 1.4924628990002576e-06, "loss": 0.7709, "step": 3188 }, { "epoch": 0.8777265533613157, "grad_norm": 0.19030775851831463, "learning_rate": 1.4858469614905003e-06, "loss": 0.7759, "step": 3189 }, { "epoch": 0.8780017890318585, "grad_norm": 0.19100282276274508, "learning_rate": 1.4792451546978171e-06, "loss": 0.7866, "step": 3190 }, { "epoch": 0.8782770247024014, "grad_norm": 0.18943433680658, "learning_rate": 1.4726574836609575e-06, "loss": 0.7883, "step": 3191 }, { "epoch": 0.8785522603729443, "grad_norm": 0.19052339389646153, "learning_rate": 1.4660839534078863e-06, "loss": 0.7429, "step": 3192 }, { "epoch": 0.8788274960434872, "grad_norm": 0.18385255754512617, "learning_rate": 1.4595245689557834e-06, "loss": 0.7684, "step": 3193 }, { "epoch": 0.8791027317140301, "grad_norm": 0.18798321377523702, "learning_rate": 1.4529793353110155e-06, "loss": 0.7868, "step": 3194 }, { "epoch": 0.8793779673845731, "grad_norm": 0.19142201236211664, "learning_rate": 1.446448257469164e-06, "loss": 0.7924, "step": 3195 }, { "epoch": 0.879653203055116, "grad_norm": 0.1886902222370272, "learning_rate": 1.439931340414995e-06, "loss": 0.7838, "step": 3196 }, { "epoch": 0.8799284387256588, "grad_norm": 0.1960848842709848, "learning_rate": 1.4334285891224786e-06, "loss": 0.772, "step": 3197 }, { "epoch": 0.8802036743962017, "grad_norm": 0.18609669227991066, "learning_rate": 1.426940008554758e-06, "loss": 0.7719, "step": 3198 }, { "epoch": 0.8804789100667446, "grad_norm": 0.18753491723105517, "learning_rate": 1.4204656036641717e-06, "loss": 0.7658, "step": 3199 }, { "epoch": 0.8807541457372875, "grad_norm": 0.18702009830021854, "learning_rate": 1.4140053793922403e-06, "loss": 0.757, "step": 3200 }, { "epoch": 0.8810293814078305, "grad_norm": 0.19158634371881353, "learning_rate": 1.4075593406696464e-06, "loss": 0.7774, "step": 3201 }, { "epoch": 0.8813046170783734, "grad_norm": 0.1890662618782119, "learning_rate": 1.4011274924162655e-06, "loss": 0.7826, "step": 3202 }, { "epoch": 0.8815798527489163, "grad_norm": 0.18656116002877973, "learning_rate": 1.3947098395411263e-06, "loss": 0.7795, "step": 3203 }, { "epoch": 0.8818550884194591, "grad_norm": 0.19052416120103913, "learning_rate": 1.388306386942433e-06, "loss": 0.7606, "step": 3204 }, { "epoch": 0.882130324090002, "grad_norm": 0.1810951845191948, "learning_rate": 1.3819171395075515e-06, "loss": 0.766, "step": 3205 }, { "epoch": 0.882405559760545, "grad_norm": 0.18871073079663003, "learning_rate": 1.3755421021129945e-06, "loss": 0.7537, "step": 3206 }, { "epoch": 0.8826807954310879, "grad_norm": 0.18354284584931382, "learning_rate": 1.369181279624443e-06, "loss": 0.7495, "step": 3207 }, { "epoch": 0.8829560311016308, "grad_norm": 0.18181011041140363, "learning_rate": 1.3628346768967183e-06, "loss": 0.7373, "step": 3208 }, { "epoch": 0.8832312667721737, "grad_norm": 0.17894662086945293, "learning_rate": 1.3565022987737897e-06, "loss": 0.7517, "step": 3209 }, { "epoch": 0.8835065024427166, "grad_norm": 0.2013464428332814, "learning_rate": 1.3501841500887846e-06, "loss": 0.7759, "step": 3210 }, { "epoch": 0.8837817381132594, "grad_norm": 0.18302769263600557, "learning_rate": 1.34388023566395e-06, "loss": 0.7858, "step": 3211 }, { "epoch": 0.8840569737838024, "grad_norm": 0.18281235658011175, "learning_rate": 1.3375905603106798e-06, "loss": 0.777, "step": 3212 }, { "epoch": 0.8843322094543453, "grad_norm": 0.18927640171862636, "learning_rate": 1.3313151288294933e-06, "loss": 0.7855, "step": 3213 }, { "epoch": 0.8846074451248882, "grad_norm": 0.18266040214288423, "learning_rate": 1.3250539460100465e-06, "loss": 0.7621, "step": 3214 }, { "epoch": 0.8848826807954311, "grad_norm": 0.1883157042438272, "learning_rate": 1.3188070166311162e-06, "loss": 0.7755, "step": 3215 }, { "epoch": 0.885157916465974, "grad_norm": 0.18478083174657808, "learning_rate": 1.3125743454605932e-06, "loss": 0.7726, "step": 3216 }, { "epoch": 0.8854331521365169, "grad_norm": 0.18882225888077922, "learning_rate": 1.3063559372555056e-06, "loss": 0.7568, "step": 3217 }, { "epoch": 0.8857083878070598, "grad_norm": 0.18861334190849974, "learning_rate": 1.3001517967619704e-06, "loss": 0.7812, "step": 3218 }, { "epoch": 0.8859836234776027, "grad_norm": 0.18786987256621746, "learning_rate": 1.293961928715235e-06, "loss": 0.7429, "step": 3219 }, { "epoch": 0.8862588591481456, "grad_norm": 0.18375418260932208, "learning_rate": 1.287786337839645e-06, "loss": 0.7463, "step": 3220 }, { "epoch": 0.8865340948186885, "grad_norm": 0.18021975385807273, "learning_rate": 1.2816250288486477e-06, "loss": 0.7551, "step": 3221 }, { "epoch": 0.8868093304892314, "grad_norm": 0.1802494592234665, "learning_rate": 1.2754780064447947e-06, "loss": 0.7416, "step": 3222 }, { "epoch": 0.8870845661597743, "grad_norm": 0.1908908424071035, "learning_rate": 1.2693452753197222e-06, "loss": 0.7839, "step": 3223 }, { "epoch": 0.8873598018303173, "grad_norm": 0.17959849574191333, "learning_rate": 1.2632268401541837e-06, "loss": 0.7536, "step": 3224 }, { "epoch": 0.8876350375008601, "grad_norm": 0.18891118267749493, "learning_rate": 1.2571227056179924e-06, "loss": 0.7777, "step": 3225 }, { "epoch": 0.887910273171403, "grad_norm": 0.18665675363449444, "learning_rate": 1.251032876370062e-06, "loss": 0.7445, "step": 3226 }, { "epoch": 0.8881855088419459, "grad_norm": 0.1828392690857017, "learning_rate": 1.244957357058394e-06, "loss": 0.7591, "step": 3227 }, { "epoch": 0.8884607445124888, "grad_norm": 0.18328334499414642, "learning_rate": 1.238896152320046e-06, "loss": 0.7548, "step": 3228 }, { "epoch": 0.8887359801830317, "grad_norm": 0.5178245776751196, "learning_rate": 1.232849266781173e-06, "loss": 0.7691, "step": 3229 }, { "epoch": 0.8890112158535747, "grad_norm": 0.1895180941966564, "learning_rate": 1.22681670505699e-06, "loss": 0.7907, "step": 3230 }, { "epoch": 0.8892864515241176, "grad_norm": 0.18912205805507937, "learning_rate": 1.2207984717517785e-06, "loss": 0.7768, "step": 3231 }, { "epoch": 0.8895616871946604, "grad_norm": 0.1816249035219771, "learning_rate": 1.2147945714588927e-06, "loss": 0.7635, "step": 3232 }, { "epoch": 0.8898369228652033, "grad_norm": 0.186735818696158, "learning_rate": 1.208805008760736e-06, "loss": 0.7789, "step": 3233 }, { "epoch": 0.8901121585357462, "grad_norm": 0.18086490081164475, "learning_rate": 1.2028297882287764e-06, "loss": 0.7433, "step": 3234 }, { "epoch": 0.8903873942062891, "grad_norm": 0.1877576003566219, "learning_rate": 1.19686891442353e-06, "loss": 0.7776, "step": 3235 }, { "epoch": 0.8906626298768321, "grad_norm": 0.17979070869113153, "learning_rate": 1.190922391894569e-06, "loss": 0.765, "step": 3236 }, { "epoch": 0.890937865547375, "grad_norm": 0.18199895198833935, "learning_rate": 1.184990225180509e-06, "loss": 0.7384, "step": 3237 }, { "epoch": 0.8912131012179179, "grad_norm": 0.1853237753597599, "learning_rate": 1.179072418809004e-06, "loss": 0.7709, "step": 3238 }, { "epoch": 0.8914883368884607, "grad_norm": 0.18993235985286852, "learning_rate": 1.1731689772967636e-06, "loss": 0.7671, "step": 3239 }, { "epoch": 0.8917635725590036, "grad_norm": 0.1801498034640683, "learning_rate": 1.167279905149512e-06, "loss": 0.7602, "step": 3240 }, { "epoch": 0.8920388082295465, "grad_norm": 0.17992316087193566, "learning_rate": 1.1614052068620208e-06, "loss": 0.7649, "step": 3241 }, { "epoch": 0.8923140439000895, "grad_norm": 0.18311155414803634, "learning_rate": 1.1555448869180897e-06, "loss": 0.7397, "step": 3242 }, { "epoch": 0.8925892795706324, "grad_norm": 0.1854289238806252, "learning_rate": 1.1496989497905342e-06, "loss": 0.7404, "step": 3243 }, { "epoch": 0.8928645152411753, "grad_norm": 0.18543812283119235, "learning_rate": 1.1438673999412054e-06, "loss": 0.7504, "step": 3244 }, { "epoch": 0.8931397509117182, "grad_norm": 0.18522208090112366, "learning_rate": 1.1380502418209604e-06, "loss": 0.7775, "step": 3245 }, { "epoch": 0.893414986582261, "grad_norm": 0.1773061329002993, "learning_rate": 1.132247479869688e-06, "loss": 0.7415, "step": 3246 }, { "epoch": 0.8936902222528039, "grad_norm": 0.185936720669809, "learning_rate": 1.1264591185162787e-06, "loss": 0.767, "step": 3247 }, { "epoch": 0.8939654579233469, "grad_norm": 0.18110163506955634, "learning_rate": 1.1206851621786275e-06, "loss": 0.7538, "step": 3248 }, { "epoch": 0.8942406935938898, "grad_norm": 0.18123217858386276, "learning_rate": 1.114925615263649e-06, "loss": 0.7726, "step": 3249 }, { "epoch": 0.8945159292644327, "grad_norm": 0.1820051225440009, "learning_rate": 1.1091804821672448e-06, "loss": 0.7715, "step": 3250 }, { "epoch": 0.8947911649349756, "grad_norm": 0.1793018545038116, "learning_rate": 1.1034497672743249e-06, "loss": 0.7782, "step": 3251 }, { "epoch": 0.8950664006055185, "grad_norm": 0.19374990373635947, "learning_rate": 1.0977334749587932e-06, "loss": 0.7709, "step": 3252 }, { "epoch": 0.8953416362760613, "grad_norm": 0.18369560061692597, "learning_rate": 1.0920316095835437e-06, "loss": 0.7898, "step": 3253 }, { "epoch": 0.8956168719466043, "grad_norm": 0.18314631607832818, "learning_rate": 1.0863441755004645e-06, "loss": 0.7599, "step": 3254 }, { "epoch": 0.8958921076171472, "grad_norm": 0.18303900316550475, "learning_rate": 1.0806711770504207e-06, "loss": 0.8034, "step": 3255 }, { "epoch": 0.8961673432876901, "grad_norm": 0.3696098952835843, "learning_rate": 1.0750126185632626e-06, "loss": 0.7584, "step": 3256 }, { "epoch": 0.896442578958233, "grad_norm": 0.1811012885139857, "learning_rate": 1.0693685043578284e-06, "loss": 0.7614, "step": 3257 }, { "epoch": 0.8967178146287759, "grad_norm": 0.17765567803019613, "learning_rate": 1.0637388387419146e-06, "loss": 0.7311, "step": 3258 }, { "epoch": 0.8969930502993188, "grad_norm": 0.18584465651559434, "learning_rate": 1.058123626012304e-06, "loss": 0.7791, "step": 3259 }, { "epoch": 0.8972682859698617, "grad_norm": 0.18131319131955945, "learning_rate": 1.0525228704547464e-06, "loss": 0.7743, "step": 3260 }, { "epoch": 0.8975435216404046, "grad_norm": 0.18550276417431893, "learning_rate": 1.0469365763439532e-06, "loss": 0.7827, "step": 3261 }, { "epoch": 0.8978187573109475, "grad_norm": 0.18588065751800437, "learning_rate": 1.0413647479435962e-06, "loss": 0.762, "step": 3262 }, { "epoch": 0.8980939929814904, "grad_norm": 0.1826564939801497, "learning_rate": 1.0358073895063136e-06, "loss": 0.7549, "step": 3263 }, { "epoch": 0.8983692286520333, "grad_norm": 0.17734120314937488, "learning_rate": 1.0302645052736992e-06, "loss": 0.7505, "step": 3264 }, { "epoch": 0.8986444643225762, "grad_norm": 0.18788877403336884, "learning_rate": 1.0247360994762888e-06, "loss": 0.7625, "step": 3265 }, { "epoch": 0.8989196999931192, "grad_norm": 0.1851095058278748, "learning_rate": 1.0192221763335807e-06, "loss": 0.7523, "step": 3266 }, { "epoch": 0.899194935663662, "grad_norm": 0.18912561658909227, "learning_rate": 1.0137227400540128e-06, "loss": 0.7689, "step": 3267 }, { "epoch": 0.8994701713342049, "grad_norm": 0.18326911598950926, "learning_rate": 1.0082377948349653e-06, "loss": 0.7805, "step": 3268 }, { "epoch": 0.8997454070047478, "grad_norm": 0.1824311163067431, "learning_rate": 1.0027673448627673e-06, "loss": 0.7798, "step": 3269 }, { "epoch": 0.9000206426752907, "grad_norm": 0.18526199323055903, "learning_rate": 9.9731139431267e-07, "loss": 0.7771, "step": 3270 }, { "epoch": 0.9002958783458336, "grad_norm": 0.18144373701305452, "learning_rate": 9.918699473488714e-07, "loss": 0.7378, "step": 3271 }, { "epoch": 0.9005711140163766, "grad_norm": 0.18700031340046053, "learning_rate": 9.864430081244892e-07, "loss": 0.7962, "step": 3272 }, { "epoch": 0.9008463496869195, "grad_norm": 0.17872148115622588, "learning_rate": 9.810305807815746e-07, "loss": 0.7409, "step": 3273 }, { "epoch": 0.9011215853574623, "grad_norm": 0.18592891265931816, "learning_rate": 9.75632669451101e-07, "loss": 0.7944, "step": 3274 }, { "epoch": 0.9013968210280052, "grad_norm": 0.1830787386929491, "learning_rate": 9.702492782529637e-07, "loss": 0.7658, "step": 3275 }, { "epoch": 0.9016720566985481, "grad_norm": 0.18752019297533465, "learning_rate": 9.648804112959786e-07, "loss": 0.7909, "step": 3276 }, { "epoch": 0.901947292369091, "grad_norm": 0.2808642778861853, "learning_rate": 9.595260726778678e-07, "loss": 0.7551, "step": 3277 }, { "epoch": 0.902222528039634, "grad_norm": 0.1816910627654414, "learning_rate": 9.541862664852686e-07, "loss": 0.7534, "step": 3278 }, { "epoch": 0.9024977637101769, "grad_norm": 0.1851249713793175, "learning_rate": 9.488609967937323e-07, "loss": 0.7766, "step": 3279 }, { "epoch": 0.9027729993807198, "grad_norm": 0.18764193289975062, "learning_rate": 9.435502676677011e-07, "loss": 0.7863, "step": 3280 }, { "epoch": 0.9030482350512626, "grad_norm": 0.18253717110615045, "learning_rate": 9.382540831605413e-07, "loss": 0.7573, "step": 3281 }, { "epoch": 0.9033234707218055, "grad_norm": 0.1840378750178227, "learning_rate": 9.329724473144974e-07, "loss": 0.7633, "step": 3282 }, { "epoch": 0.9035987063923484, "grad_norm": 0.1874412097016048, "learning_rate": 9.277053641607225e-07, "loss": 0.7579, "step": 3283 }, { "epoch": 0.9038739420628914, "grad_norm": 0.19205818139391206, "learning_rate": 9.224528377192543e-07, "loss": 0.7862, "step": 3284 }, { "epoch": 0.9041491777334343, "grad_norm": 0.18176755943939715, "learning_rate": 9.172148719990237e-07, "loss": 0.7812, "step": 3285 }, { "epoch": 0.9044244134039772, "grad_norm": 0.18335505958143317, "learning_rate": 9.119914709978528e-07, "loss": 0.7919, "step": 3286 }, { "epoch": 0.90469964907452, "grad_norm": 0.2768791477738455, "learning_rate": 9.067826387024347e-07, "loss": 0.742, "step": 3287 }, { "epoch": 0.9049748847450629, "grad_norm": 0.18882881787581965, "learning_rate": 9.015883790883629e-07, "loss": 0.7872, "step": 3288 }, { "epoch": 0.9052501204156058, "grad_norm": 0.18412990631147902, "learning_rate": 8.964086961200902e-07, "loss": 0.7766, "step": 3289 }, { "epoch": 0.9055253560861488, "grad_norm": 0.1794471080088259, "learning_rate": 8.912435937509501e-07, "loss": 0.7722, "step": 3290 }, { "epoch": 0.9058005917566917, "grad_norm": 0.18039677462433537, "learning_rate": 8.860930759231534e-07, "loss": 0.7598, "step": 3291 }, { "epoch": 0.9060758274272346, "grad_norm": 0.18257445640321843, "learning_rate": 8.809571465677691e-07, "loss": 0.7792, "step": 3292 }, { "epoch": 0.9063510630977775, "grad_norm": 0.18575558603530684, "learning_rate": 8.758358096047414e-07, "loss": 0.7628, "step": 3293 }, { "epoch": 0.9066262987683203, "grad_norm": 0.318222667109141, "learning_rate": 8.70729068942866e-07, "loss": 0.7833, "step": 3294 }, { "epoch": 0.9069015344388632, "grad_norm": 0.18547318358208326, "learning_rate": 8.656369284798071e-07, "loss": 0.7788, "step": 3295 }, { "epoch": 0.9071767701094062, "grad_norm": 0.1805376834815158, "learning_rate": 8.605593921020917e-07, "loss": 0.7593, "step": 3296 }, { "epoch": 0.9074520057799491, "grad_norm": 0.18492252898820455, "learning_rate": 8.554964636850815e-07, "loss": 0.7482, "step": 3297 }, { "epoch": 0.907727241450492, "grad_norm": 0.18150363230715091, "learning_rate": 8.504481470930037e-07, "loss": 0.7862, "step": 3298 }, { "epoch": 0.9080024771210349, "grad_norm": 0.17997984413775153, "learning_rate": 8.454144461789271e-07, "loss": 0.7625, "step": 3299 }, { "epoch": 0.9082777127915778, "grad_norm": 0.17774947845574218, "learning_rate": 8.403953647847674e-07, "loss": 0.7526, "step": 3300 }, { "epoch": 0.9085529484621206, "grad_norm": 0.18690105830088002, "learning_rate": 8.353909067412824e-07, "loss": 0.7713, "step": 3301 }, { "epoch": 0.9088281841326636, "grad_norm": 0.1837051954473274, "learning_rate": 8.304010758680614e-07, "loss": 0.7865, "step": 3302 }, { "epoch": 0.9091034198032065, "grad_norm": 0.18216246580530657, "learning_rate": 8.254258759735468e-07, "loss": 0.7659, "step": 3303 }, { "epoch": 0.9093786554737494, "grad_norm": 0.1829953371094423, "learning_rate": 8.204653108549965e-07, "loss": 0.7548, "step": 3304 }, { "epoch": 0.9096538911442923, "grad_norm": 0.17915974817091224, "learning_rate": 8.155193842985066e-07, "loss": 0.7875, "step": 3305 }, { "epoch": 0.9099291268148352, "grad_norm": 0.17946220063968904, "learning_rate": 8.105881000790016e-07, "loss": 0.7744, "step": 3306 }, { "epoch": 0.9102043624853781, "grad_norm": 0.18736348652154028, "learning_rate": 8.056714619602246e-07, "loss": 0.7744, "step": 3307 }, { "epoch": 0.910479598155921, "grad_norm": 0.18232306632741607, "learning_rate": 8.007694736947491e-07, "loss": 0.7693, "step": 3308 }, { "epoch": 0.9107548338264639, "grad_norm": 0.18063025095043908, "learning_rate": 7.958821390239535e-07, "loss": 0.756, "step": 3309 }, { "epoch": 0.9110300694970068, "grad_norm": 0.1801797098842179, "learning_rate": 7.910094616780495e-07, "loss": 0.7509, "step": 3310 }, { "epoch": 0.9113053051675497, "grad_norm": 0.1835824285114398, "learning_rate": 7.861514453760466e-07, "loss": 0.7738, "step": 3311 }, { "epoch": 0.9115805408380926, "grad_norm": 0.1813721539661817, "learning_rate": 7.813080938257722e-07, "loss": 0.7859, "step": 3312 }, { "epoch": 0.9118557765086355, "grad_norm": 0.1819370221825069, "learning_rate": 7.764794107238627e-07, "loss": 0.7584, "step": 3313 }, { "epoch": 0.9121310121791785, "grad_norm": 0.18301458943707027, "learning_rate": 7.716653997557521e-07, "loss": 0.7881, "step": 3314 }, { "epoch": 0.9124062478497214, "grad_norm": 0.17943713213846424, "learning_rate": 7.668660645956794e-07, "loss": 0.7649, "step": 3315 }, { "epoch": 0.9126814835202642, "grad_norm": 0.1810842977738103, "learning_rate": 7.62081408906683e-07, "loss": 0.7749, "step": 3316 }, { "epoch": 0.9129567191908071, "grad_norm": 0.17991730122011682, "learning_rate": 7.573114363405976e-07, "loss": 0.7736, "step": 3317 }, { "epoch": 0.91323195486135, "grad_norm": 0.177238683292289, "learning_rate": 7.52556150538053e-07, "loss": 0.7663, "step": 3318 }, { "epoch": 0.9135071905318929, "grad_norm": 0.1822944602073359, "learning_rate": 7.478155551284638e-07, "loss": 0.7536, "step": 3319 }, { "epoch": 0.9137824262024359, "grad_norm": 0.1814977801758155, "learning_rate": 7.430896537300381e-07, "loss": 0.7387, "step": 3320 }, { "epoch": 0.9140576618729788, "grad_norm": 0.17747273869824762, "learning_rate": 7.383784499497637e-07, "loss": 0.7648, "step": 3321 }, { "epoch": 0.9143328975435216, "grad_norm": 0.18445974312771046, "learning_rate": 7.336819473834134e-07, "loss": 0.8025, "step": 3322 }, { "epoch": 0.9146081332140645, "grad_norm": 0.18042862375122798, "learning_rate": 7.290001496155418e-07, "loss": 0.7722, "step": 3323 }, { "epoch": 0.9148833688846074, "grad_norm": 0.1829255169316263, "learning_rate": 7.243330602194754e-07, "loss": 0.8095, "step": 3324 }, { "epoch": 0.9151586045551503, "grad_norm": 0.18009375165013491, "learning_rate": 7.196806827573222e-07, "loss": 0.7736, "step": 3325 }, { "epoch": 0.9154338402256933, "grad_norm": 0.18169937563497035, "learning_rate": 7.150430207799486e-07, "loss": 0.795, "step": 3326 }, { "epoch": 0.9157090758962362, "grad_norm": 0.1788925426445409, "learning_rate": 7.104200778270032e-07, "loss": 0.7684, "step": 3327 }, { "epoch": 0.9159843115667791, "grad_norm": 0.18003129703576484, "learning_rate": 7.058118574268969e-07, "loss": 0.7671, "step": 3328 }, { "epoch": 0.916259547237322, "grad_norm": 0.18689407052760498, "learning_rate": 7.012183630967939e-07, "loss": 0.8137, "step": 3329 }, { "epoch": 0.9165347829078648, "grad_norm": 0.18569610275362458, "learning_rate": 6.966395983426299e-07, "loss": 0.7733, "step": 3330 }, { "epoch": 0.9168100185784077, "grad_norm": 0.18134155747735997, "learning_rate": 6.920755666590961e-07, "loss": 0.7843, "step": 3331 }, { "epoch": 0.9170852542489507, "grad_norm": 0.17665921722270503, "learning_rate": 6.875262715296393e-07, "loss": 0.775, "step": 3332 }, { "epoch": 0.9173604899194936, "grad_norm": 0.25057512052343545, "learning_rate": 6.829917164264554e-07, "loss": 0.785, "step": 3333 }, { "epoch": 0.9176357255900365, "grad_norm": 0.18372676596852336, "learning_rate": 6.784719048104915e-07, "loss": 0.7837, "step": 3334 }, { "epoch": 0.9179109612605794, "grad_norm": 0.21021110732434006, "learning_rate": 6.739668401314459e-07, "loss": 0.7666, "step": 3335 }, { "epoch": 0.9181861969311222, "grad_norm": 0.17999276756879717, "learning_rate": 6.694765258277524e-07, "loss": 0.7575, "step": 3336 }, { "epoch": 0.9184614326016651, "grad_norm": 0.18350298484920563, "learning_rate": 6.650009653265965e-07, "loss": 0.7385, "step": 3337 }, { "epoch": 0.9187366682722081, "grad_norm": 0.19485152267127173, "learning_rate": 6.605401620438967e-07, "loss": 0.7859, "step": 3338 }, { "epoch": 0.919011903942751, "grad_norm": 0.183347840395693, "learning_rate": 6.560941193843118e-07, "loss": 0.7595, "step": 3339 }, { "epoch": 0.9192871396132939, "grad_norm": 0.18164410805064746, "learning_rate": 6.516628407412362e-07, "loss": 0.7729, "step": 3340 }, { "epoch": 0.9195623752838368, "grad_norm": 0.18128818734487367, "learning_rate": 6.47246329496789e-07, "loss": 0.7729, "step": 3341 }, { "epoch": 0.9198376109543797, "grad_norm": 0.1754103639942013, "learning_rate": 6.428445890218205e-07, "loss": 0.7674, "step": 3342 }, { "epoch": 0.9201128466249225, "grad_norm": 0.18003330952170885, "learning_rate": 6.384576226759165e-07, "loss": 0.7492, "step": 3343 }, { "epoch": 0.9203880822954655, "grad_norm": 0.188265865346989, "learning_rate": 6.340854338073699e-07, "loss": 0.8079, "step": 3344 }, { "epoch": 0.9206633179660084, "grad_norm": 0.18198473434318366, "learning_rate": 6.297280257532112e-07, "loss": 0.7815, "step": 3345 }, { "epoch": 0.9209385536365513, "grad_norm": 0.1886024420024592, "learning_rate": 6.25385401839178e-07, "loss": 0.7756, "step": 3346 }, { "epoch": 0.9212137893070942, "grad_norm": 0.17627773507695044, "learning_rate": 6.210575653797346e-07, "loss": 0.7598, "step": 3347 }, { "epoch": 0.9214890249776371, "grad_norm": 0.18682397363854383, "learning_rate": 6.167445196780475e-07, "loss": 0.7988, "step": 3348 }, { "epoch": 0.92176426064818, "grad_norm": 0.1814496775862832, "learning_rate": 6.124462680260035e-07, "loss": 0.7625, "step": 3349 }, { "epoch": 0.922039496318723, "grad_norm": 0.18931687425075752, "learning_rate": 6.081628137041917e-07, "loss": 0.7709, "step": 3350 }, { "epoch": 0.9223147319892658, "grad_norm": 0.18573460812678885, "learning_rate": 6.038941599819104e-07, "loss": 0.7779, "step": 3351 }, { "epoch": 0.9225899676598087, "grad_norm": 0.18600778689760136, "learning_rate": 5.996403101171622e-07, "loss": 0.7779, "step": 3352 }, { "epoch": 0.9228652033303516, "grad_norm": 0.1794970277809988, "learning_rate": 5.954012673566479e-07, "loss": 0.772, "step": 3353 }, { "epoch": 0.9231404390008945, "grad_norm": 0.1776580578157053, "learning_rate": 5.911770349357704e-07, "loss": 0.7636, "step": 3354 }, { "epoch": 0.9234156746714374, "grad_norm": 0.17259111731943394, "learning_rate": 5.869676160786308e-07, "loss": 0.7531, "step": 3355 }, { "epoch": 0.9236909103419804, "grad_norm": 0.18573617522316438, "learning_rate": 5.827730139980125e-07, "loss": 0.7519, "step": 3356 }, { "epoch": 0.9239661460125232, "grad_norm": 0.17880114121294402, "learning_rate": 5.785932318954035e-07, "loss": 0.781, "step": 3357 }, { "epoch": 0.9242413816830661, "grad_norm": 0.18487770952312754, "learning_rate": 5.744282729609696e-07, "loss": 0.7633, "step": 3358 }, { "epoch": 0.924516617353609, "grad_norm": 0.18177462817697282, "learning_rate": 5.702781403735746e-07, "loss": 0.7517, "step": 3359 }, { "epoch": 0.9247918530241519, "grad_norm": 0.29856715115902766, "learning_rate": 5.66142837300756e-07, "loss": 0.7806, "step": 3360 }, { "epoch": 0.9250670886946948, "grad_norm": 0.18204168450450275, "learning_rate": 5.620223668987379e-07, "loss": 0.7817, "step": 3361 }, { "epoch": 0.9253423243652378, "grad_norm": 0.1780772942381337, "learning_rate": 5.579167323124268e-07, "loss": 0.761, "step": 3362 }, { "epoch": 0.9256175600357807, "grad_norm": 0.17719504717407475, "learning_rate": 5.53825936675394e-07, "loss": 0.7488, "step": 3363 }, { "epoch": 0.9258927957063235, "grad_norm": 0.18664310097507678, "learning_rate": 5.497499831098974e-07, "loss": 0.7939, "step": 3364 }, { "epoch": 0.9261680313768664, "grad_norm": 0.17977919885884985, "learning_rate": 5.456888747268641e-07, "loss": 0.7688, "step": 3365 }, { "epoch": 0.9264432670474093, "grad_norm": 0.1773780216528953, "learning_rate": 5.416426146258835e-07, "loss": 0.7589, "step": 3366 }, { "epoch": 0.9267185027179522, "grad_norm": 0.18386694009127807, "learning_rate": 5.376112058952232e-07, "loss": 0.7802, "step": 3367 }, { "epoch": 0.9269937383884952, "grad_norm": 0.18115229126900081, "learning_rate": 5.33594651611804e-07, "loss": 0.7505, "step": 3368 }, { "epoch": 0.9272689740590381, "grad_norm": 0.1817925842703226, "learning_rate": 5.295929548412227e-07, "loss": 0.7558, "step": 3369 }, { "epoch": 0.927544209729581, "grad_norm": 0.18434080805609537, "learning_rate": 5.256061186377226e-07, "loss": 0.7528, "step": 3370 }, { "epoch": 0.9278194454001238, "grad_norm": 0.18799234590140312, "learning_rate": 5.216341460442143e-07, "loss": 0.7779, "step": 3371 }, { "epoch": 0.9280946810706667, "grad_norm": 0.18458407979936434, "learning_rate": 5.176770400922614e-07, "loss": 0.8223, "step": 3372 }, { "epoch": 0.9283699167412096, "grad_norm": 0.1753178844170801, "learning_rate": 5.137348038020751e-07, "loss": 0.7469, "step": 3373 }, { "epoch": 0.9286451524117526, "grad_norm": 0.1778575071911813, "learning_rate": 5.098074401825282e-07, "loss": 0.7538, "step": 3374 }, { "epoch": 0.9289203880822955, "grad_norm": 0.1783933538909113, "learning_rate": 5.05894952231134e-07, "loss": 0.7656, "step": 3375 }, { "epoch": 0.9291956237528384, "grad_norm": 0.18309823615905257, "learning_rate": 5.019973429340552e-07, "loss": 0.7774, "step": 3376 }, { "epoch": 0.9294708594233813, "grad_norm": 0.18051321250726957, "learning_rate": 4.981146152661009e-07, "loss": 0.7804, "step": 3377 }, { "epoch": 0.9297460950939241, "grad_norm": 0.18438418164768278, "learning_rate": 4.942467721907118e-07, "loss": 0.789, "step": 3378 }, { "epoch": 0.930021330764467, "grad_norm": 0.17827807824858685, "learning_rate": 4.903938166599797e-07, "loss": 0.7528, "step": 3379 }, { "epoch": 0.93029656643501, "grad_norm": 0.18112310348622002, "learning_rate": 4.865557516146258e-07, "loss": 0.7756, "step": 3380 }, { "epoch": 0.9305718021055529, "grad_norm": 0.1813875849515388, "learning_rate": 4.827325799840155e-07, "loss": 0.7704, "step": 3381 }, { "epoch": 0.9308470377760958, "grad_norm": 0.182498966296108, "learning_rate": 4.78924304686137e-07, "loss": 0.8009, "step": 3382 }, { "epoch": 0.9311222734466387, "grad_norm": 0.18239337308281747, "learning_rate": 4.75130928627614e-07, "loss": 0.7639, "step": 3383 }, { "epoch": 0.9313975091171816, "grad_norm": 0.18190491248200163, "learning_rate": 4.713524547036996e-07, "loss": 0.7566, "step": 3384 }, { "epoch": 0.9316727447877244, "grad_norm": 0.1822250097338557, "learning_rate": 4.675888857982669e-07, "loss": 0.8163, "step": 3385 }, { "epoch": 0.9319479804582674, "grad_norm": 0.18217006472188224, "learning_rate": 4.638402247838203e-07, "loss": 0.7822, "step": 3386 }, { "epoch": 0.9322232161288103, "grad_norm": 0.1781627244840082, "learning_rate": 4.6010647452148005e-07, "loss": 0.7686, "step": 3387 }, { "epoch": 0.9324984517993532, "grad_norm": 0.17740412794334748, "learning_rate": 4.5638763786099324e-07, "loss": 0.7596, "step": 3388 }, { "epoch": 0.9327736874698961, "grad_norm": 0.18587976864248756, "learning_rate": 4.526837176407162e-07, "loss": 0.7775, "step": 3389 }, { "epoch": 0.933048923140439, "grad_norm": 0.18231904518557465, "learning_rate": 4.4899471668762517e-07, "loss": 0.7714, "step": 3390 }, { "epoch": 0.9333241588109819, "grad_norm": 0.177209197535535, "learning_rate": 4.4532063781730585e-07, "loss": 0.7742, "step": 3391 }, { "epoch": 0.9335993944815248, "grad_norm": 0.17835140414163642, "learning_rate": 4.416614838339639e-07, "loss": 0.7596, "step": 3392 }, { "epoch": 0.9338746301520677, "grad_norm": 0.17697842072296835, "learning_rate": 4.380172575303987e-07, "loss": 0.7368, "step": 3393 }, { "epoch": 0.9341498658226106, "grad_norm": 0.17759407218544426, "learning_rate": 4.3438796168802753e-07, "loss": 0.7529, "step": 3394 }, { "epoch": 0.9344251014931535, "grad_norm": 0.1837684811218352, "learning_rate": 4.307735990768702e-07, "loss": 0.7512, "step": 3395 }, { "epoch": 0.9347003371636964, "grad_norm": 0.1825328000991451, "learning_rate": 4.2717417245555113e-07, "loss": 0.7759, "step": 3396 }, { "epoch": 0.9349755728342393, "grad_norm": 0.178967684743946, "learning_rate": 4.2358968457128615e-07, "loss": 0.7585, "step": 3397 }, { "epoch": 0.9352508085047823, "grad_norm": 0.17499958571737417, "learning_rate": 4.200201381598956e-07, "loss": 0.7743, "step": 3398 }, { "epoch": 0.9355260441753251, "grad_norm": 0.18041431537799826, "learning_rate": 4.164655359458003e-07, "loss": 0.7535, "step": 3399 }, { "epoch": 0.935801279845868, "grad_norm": 0.2035188034133509, "learning_rate": 4.1292588064200334e-07, "loss": 0.775, "step": 3400 }, { "epoch": 0.9360765155164109, "grad_norm": 0.18313380673794988, "learning_rate": 4.094011749501103e-07, "loss": 0.7803, "step": 3401 }, { "epoch": 0.9363517511869538, "grad_norm": 0.21531022816264084, "learning_rate": 4.0589142156031156e-07, "loss": 0.7567, "step": 3402 }, { "epoch": 0.9366269868574967, "grad_norm": 0.18203357333827486, "learning_rate": 4.023966231513887e-07, "loss": 0.7652, "step": 3403 }, { "epoch": 0.9369022225280397, "grad_norm": 0.18230096273909116, "learning_rate": 3.9891678239070586e-07, "loss": 0.7942, "step": 3404 }, { "epoch": 0.9371774581985826, "grad_norm": 0.17834518000836627, "learning_rate": 3.9545190193420955e-07, "loss": 0.774, "step": 3405 }, { "epoch": 0.9374526938691254, "grad_norm": 0.17894154877984061, "learning_rate": 3.920019844264356e-07, "loss": 0.7573, "step": 3406 }, { "epoch": 0.9377279295396683, "grad_norm": 0.17798035810326152, "learning_rate": 3.8856703250048866e-07, "loss": 0.7611, "step": 3407 }, { "epoch": 0.9380031652102112, "grad_norm": 0.18057571880520498, "learning_rate": 3.8514704877805844e-07, "loss": 0.7588, "step": 3408 }, { "epoch": 0.9382784008807541, "grad_norm": 0.18886857810380955, "learning_rate": 3.817420358694102e-07, "loss": 0.7936, "step": 3409 }, { "epoch": 0.9385536365512971, "grad_norm": 0.18383807064836205, "learning_rate": 3.783519963733806e-07, "loss": 0.7722, "step": 3410 }, { "epoch": 0.93882887222184, "grad_norm": 0.2625029611934594, "learning_rate": 3.7497693287738e-07, "loss": 0.7902, "step": 3411 }, { "epoch": 0.9391041078923829, "grad_norm": 0.17861620442865095, "learning_rate": 3.716168479573834e-07, "loss": 0.7437, "step": 3412 }, { "epoch": 0.9393793435629257, "grad_norm": 0.2251790915406425, "learning_rate": 3.6827174417794153e-07, "loss": 0.7849, "step": 3413 }, { "epoch": 0.9396545792334686, "grad_norm": 0.17782609807406816, "learning_rate": 3.649416240921677e-07, "loss": 0.7689, "step": 3414 }, { "epoch": 0.9399298149040115, "grad_norm": 0.17468421444719337, "learning_rate": 3.6162649024173327e-07, "loss": 0.7336, "step": 3415 }, { "epoch": 0.9402050505745545, "grad_norm": 0.2308708162147, "learning_rate": 3.583263451568808e-07, "loss": 0.7678, "step": 3416 }, { "epoch": 0.9404802862450974, "grad_norm": 0.17777308245293016, "learning_rate": 3.550411913564067e-07, "loss": 0.7612, "step": 3417 }, { "epoch": 0.9407555219156403, "grad_norm": 0.18466314423449567, "learning_rate": 3.517710313476652e-07, "loss": 0.7925, "step": 3418 }, { "epoch": 0.9410307575861832, "grad_norm": 0.1798245361167039, "learning_rate": 3.485158676265754e-07, "loss": 0.7947, "step": 3419 }, { "epoch": 0.941305993256726, "grad_norm": 0.1767865918196957, "learning_rate": 3.452757026775988e-07, "loss": 0.7784, "step": 3420 }, { "epoch": 0.9415812289272689, "grad_norm": 0.17620272589442937, "learning_rate": 3.4205053897375497e-07, "loss": 0.7361, "step": 3421 }, { "epoch": 0.9418564645978119, "grad_norm": 0.1801820070268112, "learning_rate": 3.3884037897661483e-07, "loss": 0.7692, "step": 3422 }, { "epoch": 0.9421317002683548, "grad_norm": 0.1799634706851007, "learning_rate": 3.3564522513629407e-07, "loss": 0.7474, "step": 3423 }, { "epoch": 0.9424069359388977, "grad_norm": 0.17840757969221127, "learning_rate": 3.324650798914597e-07, "loss": 0.7877, "step": 3424 }, { "epoch": 0.9426821716094406, "grad_norm": 0.18213285386900555, "learning_rate": 3.2929994566932134e-07, "loss": 0.7877, "step": 3425 }, { "epoch": 0.9429574072799835, "grad_norm": 0.1829158920280716, "learning_rate": 3.261498248856332e-07, "loss": 0.7691, "step": 3426 }, { "epoch": 0.9432326429505263, "grad_norm": 0.2714194726349643, "learning_rate": 3.2301471994468536e-07, "loss": 0.786, "step": 3427 }, { "epoch": 0.9435078786210693, "grad_norm": 0.17962525664904788, "learning_rate": 3.198946332393127e-07, "loss": 0.7743, "step": 3428 }, { "epoch": 0.9437831142916122, "grad_norm": 0.18276403812114503, "learning_rate": 3.167895671508903e-07, "loss": 0.7794, "step": 3429 }, { "epoch": 0.9440583499621551, "grad_norm": 0.17930935781084684, "learning_rate": 3.136995240493157e-07, "loss": 0.7577, "step": 3430 }, { "epoch": 0.944333585632698, "grad_norm": 0.18165700525680606, "learning_rate": 3.10624506293038e-07, "loss": 0.7765, "step": 3431 }, { "epoch": 0.9446088213032409, "grad_norm": 0.17966295155502715, "learning_rate": 3.0756451622902416e-07, "loss": 0.7672, "step": 3432 }, { "epoch": 0.9448840569737837, "grad_norm": 0.17842326449913629, "learning_rate": 3.0451955619278164e-07, "loss": 0.7696, "step": 3433 }, { "epoch": 0.9451592926443267, "grad_norm": 0.173822163120527, "learning_rate": 3.014896285083357e-07, "loss": 0.7393, "step": 3434 }, { "epoch": 0.9454345283148696, "grad_norm": 0.1794448576189847, "learning_rate": 2.984747354882456e-07, "loss": 0.7588, "step": 3435 }, { "epoch": 0.9457097639854125, "grad_norm": 0.18133635826751426, "learning_rate": 2.954748794335993e-07, "loss": 0.772, "step": 3436 }, { "epoch": 0.9459849996559554, "grad_norm": 0.17803707386819312, "learning_rate": 2.924900626339966e-07, "loss": 0.7848, "step": 3437 }, { "epoch": 0.9462602353264983, "grad_norm": 0.1784366672781347, "learning_rate": 2.895202873675684e-07, "loss": 0.7754, "step": 3438 }, { "epoch": 0.9465354709970412, "grad_norm": 0.18170351773250304, "learning_rate": 2.865655559009617e-07, "loss": 0.7824, "step": 3439 }, { "epoch": 0.9468107066675842, "grad_norm": 0.17819881192043055, "learning_rate": 2.836258704893391e-07, "loss": 0.7476, "step": 3440 }, { "epoch": 0.947085942338127, "grad_norm": 0.4997461614693514, "learning_rate": 2.807012333763881e-07, "loss": 0.8007, "step": 3441 }, { "epoch": 0.9473611780086699, "grad_norm": 0.1769510006550083, "learning_rate": 2.7779164679429873e-07, "loss": 0.7757, "step": 3442 }, { "epoch": 0.9476364136792128, "grad_norm": 0.34088402821254643, "learning_rate": 2.7489711296378343e-07, "loss": 0.7744, "step": 3443 }, { "epoch": 0.9479116493497557, "grad_norm": 0.17867595346551646, "learning_rate": 2.7201763409405726e-07, "loss": 0.7518, "step": 3444 }, { "epoch": 0.9481868850202986, "grad_norm": 0.18289283467859183, "learning_rate": 2.6915321238285773e-07, "loss": 0.7586, "step": 3445 }, { "epoch": 0.9484621206908416, "grad_norm": 0.18000393757159489, "learning_rate": 2.663038500164161e-07, "loss": 0.7891, "step": 3446 }, { "epoch": 0.9487373563613845, "grad_norm": 0.17811652924158544, "learning_rate": 2.634695491694772e-07, "loss": 0.7802, "step": 3447 }, { "epoch": 0.9490125920319273, "grad_norm": 0.1776492457184233, "learning_rate": 2.606503120052906e-07, "loss": 0.7593, "step": 3448 }, { "epoch": 0.9492878277024702, "grad_norm": 0.18636513662190413, "learning_rate": 2.578461406756061e-07, "loss": 0.7872, "step": 3449 }, { "epoch": 0.9495630633730131, "grad_norm": 0.17671353163445164, "learning_rate": 2.55057037320674e-07, "loss": 0.7709, "step": 3450 }, { "epoch": 0.9498382990435561, "grad_norm": 0.1802777302014596, "learning_rate": 2.52283004069247e-07, "loss": 0.7574, "step": 3451 }, { "epoch": 0.950113534714099, "grad_norm": 0.17787202683873812, "learning_rate": 2.495240430385737e-07, "loss": 0.7656, "step": 3452 }, { "epoch": 0.9503887703846419, "grad_norm": 0.17901545341346076, "learning_rate": 2.467801563344052e-07, "loss": 0.7717, "step": 3453 }, { "epoch": 0.9506640060551848, "grad_norm": 0.17597541158563862, "learning_rate": 2.4405134605097304e-07, "loss": 0.7845, "step": 3454 }, { "epoch": 0.9509392417257276, "grad_norm": 0.1797347844276362, "learning_rate": 2.4133761427101776e-07, "loss": 0.7682, "step": 3455 }, { "epoch": 0.9512144773962705, "grad_norm": 0.17984792534794747, "learning_rate": 2.386389630657604e-07, "loss": 0.7789, "step": 3456 }, { "epoch": 0.9514897130668135, "grad_norm": 0.18737390267781007, "learning_rate": 2.3595539449491778e-07, "loss": 0.7722, "step": 3457 }, { "epoch": 0.9517649487373564, "grad_norm": 0.17653028561183995, "learning_rate": 2.332869106066915e-07, "loss": 0.7541, "step": 3458 }, { "epoch": 0.9520401844078993, "grad_norm": 0.1803865835967897, "learning_rate": 2.3063351343777241e-07, "loss": 0.7788, "step": 3459 }, { "epoch": 0.9523154200784422, "grad_norm": 0.17817562075030466, "learning_rate": 2.2799520501333606e-07, "loss": 0.7723, "step": 3460 }, { "epoch": 0.952590655748985, "grad_norm": 0.17930027186013844, "learning_rate": 2.253719873470406e-07, "loss": 0.7823, "step": 3461 }, { "epoch": 0.9528658914195279, "grad_norm": 0.17834616787129215, "learning_rate": 2.2276386244102888e-07, "loss": 0.7683, "step": 3462 }, { "epoch": 0.9531411270900709, "grad_norm": 0.17608618430327727, "learning_rate": 2.2017083228592195e-07, "loss": 0.768, "step": 3463 }, { "epoch": 0.9534163627606138, "grad_norm": 0.1818134089880582, "learning_rate": 2.1759289886081892e-07, "loss": 0.7905, "step": 3464 }, { "epoch": 0.9536915984311567, "grad_norm": 0.1801450580653961, "learning_rate": 2.1503006413330142e-07, "loss": 0.7651, "step": 3465 }, { "epoch": 0.9539668341016996, "grad_norm": 0.3924989381263502, "learning_rate": 2.124823300594181e-07, "loss": 0.7706, "step": 3466 }, { "epoch": 0.9542420697722425, "grad_norm": 0.17912647516824165, "learning_rate": 2.0994969858370463e-07, "loss": 0.7426, "step": 3467 }, { "epoch": 0.9545173054427853, "grad_norm": 0.1745049111177619, "learning_rate": 2.074321716391614e-07, "loss": 0.7413, "step": 3468 }, { "epoch": 0.9547925411133283, "grad_norm": 0.18118406596132217, "learning_rate": 2.049297511472581e-07, "loss": 0.7878, "step": 3469 }, { "epoch": 0.9550677767838712, "grad_norm": 0.1803089579448522, "learning_rate": 2.024424390179447e-07, "loss": 0.7702, "step": 3470 }, { "epoch": 0.9553430124544141, "grad_norm": 0.177746866281357, "learning_rate": 1.999702371496315e-07, "loss": 0.7828, "step": 3471 }, { "epoch": 0.955618248124957, "grad_norm": 0.17813758130928734, "learning_rate": 1.975131474291958e-07, "loss": 0.7779, "step": 3472 }, { "epoch": 0.9558934837954999, "grad_norm": 0.1767147746377242, "learning_rate": 1.9507117173198864e-07, "loss": 0.7582, "step": 3473 }, { "epoch": 0.9561687194660428, "grad_norm": 0.25456672245448575, "learning_rate": 1.9264431192181466e-07, "loss": 0.7773, "step": 3474 }, { "epoch": 0.9564439551365858, "grad_norm": 0.1789887411926834, "learning_rate": 1.9023256985095217e-07, "loss": 0.7461, "step": 3475 }, { "epoch": 0.9567191908071286, "grad_norm": 0.18163546011577508, "learning_rate": 1.8783594736013322e-07, "loss": 0.7768, "step": 3476 }, { "epoch": 0.9569944264776715, "grad_norm": 0.2702597219081077, "learning_rate": 1.8545444627855236e-07, "loss": 0.7632, "step": 3477 }, { "epoch": 0.9572696621482144, "grad_norm": 0.2997512889466131, "learning_rate": 1.830880684238645e-07, "loss": 0.7465, "step": 3478 }, { "epoch": 0.9575448978187573, "grad_norm": 0.2711901813937954, "learning_rate": 1.8073681560218047e-07, "loss": 0.77, "step": 3479 }, { "epoch": 0.9578201334893002, "grad_norm": 0.18179524960436003, "learning_rate": 1.78400689608067e-07, "loss": 0.767, "step": 3480 }, { "epoch": 0.9580953691598432, "grad_norm": 0.1787474371453518, "learning_rate": 1.7607969222454446e-07, "loss": 0.737, "step": 3481 }, { "epoch": 0.958370604830386, "grad_norm": 0.17646985645336227, "learning_rate": 1.7377382522309138e-07, "loss": 0.7675, "step": 3482 }, { "epoch": 0.9586458405009289, "grad_norm": 0.1772746613899236, "learning_rate": 1.714830903636311e-07, "loss": 0.7884, "step": 3483 }, { "epoch": 0.9589210761714718, "grad_norm": 0.17532594677389737, "learning_rate": 1.6920748939454058e-07, "loss": 0.7789, "step": 3484 }, { "epoch": 0.9591963118420147, "grad_norm": 0.1725185422582348, "learning_rate": 1.669470240526505e-07, "loss": 0.7441, "step": 3485 }, { "epoch": 0.9594715475125576, "grad_norm": 0.1832051702657459, "learning_rate": 1.6470169606323193e-07, "loss": 0.7707, "step": 3486 }, { "epoch": 0.9597467831831006, "grad_norm": 0.18308635605218143, "learning_rate": 1.6247150714000514e-07, "loss": 0.7884, "step": 3487 }, { "epoch": 0.9600220188536435, "grad_norm": 0.1780212599011205, "learning_rate": 1.6025645898513963e-07, "loss": 0.7769, "step": 3488 }, { "epoch": 0.9602972545241864, "grad_norm": 0.17867323891786976, "learning_rate": 1.5805655328924308e-07, "loss": 0.7613, "step": 3489 }, { "epoch": 0.9605724901947292, "grad_norm": 0.1817119219295843, "learning_rate": 1.5587179173137234e-07, "loss": 0.7804, "step": 3490 }, { "epoch": 0.9608477258652721, "grad_norm": 0.18151539362345998, "learning_rate": 1.5370217597901805e-07, "loss": 0.7813, "step": 3491 }, { "epoch": 0.961122961535815, "grad_norm": 0.1810355088739828, "learning_rate": 1.5154770768811556e-07, "loss": 0.7612, "step": 3492 }, { "epoch": 0.961398197206358, "grad_norm": 0.18718055180850396, "learning_rate": 1.4940838850304063e-07, "loss": 0.7784, "step": 3493 }, { "epoch": 0.9616734328769009, "grad_norm": 0.17661636434505176, "learning_rate": 1.4728422005660048e-07, "loss": 0.7577, "step": 3494 }, { "epoch": 0.9619486685474438, "grad_norm": 0.17718307154213916, "learning_rate": 1.4517520397004492e-07, "loss": 0.7757, "step": 3495 }, { "epoch": 0.9622239042179866, "grad_norm": 0.1778218613606195, "learning_rate": 1.4308134185305522e-07, "loss": 0.7676, "step": 3496 }, { "epoch": 0.9624991398885295, "grad_norm": 0.18337886620272192, "learning_rate": 1.4100263530375081e-07, "loss": 0.7889, "step": 3497 }, { "epoch": 0.9627743755590724, "grad_norm": 0.18216361767310676, "learning_rate": 1.3893908590867811e-07, "loss": 0.7822, "step": 3498 }, { "epoch": 0.9630496112296154, "grad_norm": 0.1798065585331738, "learning_rate": 1.3689069524281728e-07, "loss": 0.7625, "step": 3499 }, { "epoch": 0.9633248469001583, "grad_norm": 0.17296336534665763, "learning_rate": 1.3485746486958217e-07, "loss": 0.7485, "step": 3500 }, { "epoch": 0.9636000825707012, "grad_norm": 0.17646303216234493, "learning_rate": 1.3283939634081143e-07, "loss": 0.7751, "step": 3501 }, { "epoch": 0.9638753182412441, "grad_norm": 0.18295128543702405, "learning_rate": 1.3083649119677078e-07, "loss": 0.7844, "step": 3502 }, { "epoch": 0.964150553911787, "grad_norm": 0.18151638905946132, "learning_rate": 1.2884875096615734e-07, "loss": 0.7903, "step": 3503 }, { "epoch": 0.9644257895823298, "grad_norm": 0.18140173990630296, "learning_rate": 1.2687617716609092e-07, "loss": 0.775, "step": 3504 }, { "epoch": 0.9647010252528728, "grad_norm": 0.17404082168705917, "learning_rate": 1.2491877130211606e-07, "loss": 0.7329, "step": 3505 }, { "epoch": 0.9649762609234157, "grad_norm": 0.2732951700976934, "learning_rate": 1.2297653486819994e-07, "loss": 0.7756, "step": 3506 }, { "epoch": 0.9652514965939586, "grad_norm": 0.17879034445366826, "learning_rate": 1.2104946934673235e-07, "loss": 0.7718, "step": 3507 }, { "epoch": 0.9655267322645015, "grad_norm": 0.17793241131942597, "learning_rate": 1.1913757620852562e-07, "loss": 0.7533, "step": 3508 }, { "epoch": 0.9658019679350444, "grad_norm": 0.18249359033078905, "learning_rate": 1.1724085691280806e-07, "loss": 0.7843, "step": 3509 }, { "epoch": 0.9660772036055872, "grad_norm": 0.17887088090366485, "learning_rate": 1.1535931290723057e-07, "loss": 0.7836, "step": 3510 }, { "epoch": 0.9663524392761302, "grad_norm": 0.17649617617448546, "learning_rate": 1.1349294562786217e-07, "loss": 0.7488, "step": 3511 }, { "epoch": 0.9666276749466731, "grad_norm": 0.17699676701245734, "learning_rate": 1.1164175649918341e-07, "loss": 0.7714, "step": 3512 }, { "epoch": 0.966902910617216, "grad_norm": 0.1802940285321163, "learning_rate": 1.0980574693409295e-07, "loss": 0.7283, "step": 3513 }, { "epoch": 0.9671781462877589, "grad_norm": 0.1810028532172541, "learning_rate": 1.0798491833390767e-07, "loss": 0.7777, "step": 3514 }, { "epoch": 0.9674533819583018, "grad_norm": 0.18222466272505522, "learning_rate": 1.0617927208835143e-07, "loss": 0.7573, "step": 3515 }, { "epoch": 0.9677286176288447, "grad_norm": 0.17622547495446858, "learning_rate": 1.0438880957556408e-07, "loss": 0.7675, "step": 3516 }, { "epoch": 0.9680038532993877, "grad_norm": 0.17456426789450077, "learning_rate": 1.0261353216209691e-07, "loss": 0.7666, "step": 3517 }, { "epoch": 0.9682790889699305, "grad_norm": 0.6731092038117731, "learning_rate": 1.008534412029083e-07, "loss": 0.8104, "step": 3518 }, { "epoch": 0.9685543246404734, "grad_norm": 0.18001065930138233, "learning_rate": 9.910853804137033e-08, "loss": 0.7596, "step": 3519 }, { "epoch": 0.9688295603110163, "grad_norm": 0.17976827811068685, "learning_rate": 9.737882400925768e-08, "loss": 0.7746, "step": 3520 }, { "epoch": 0.9691047959815592, "grad_norm": 0.17377817363019726, "learning_rate": 9.566430042675657e-08, "loss": 0.741, "step": 3521 }, { "epoch": 0.9693800316521021, "grad_norm": 0.1790577389207181, "learning_rate": 9.396496860245797e-08, "loss": 0.7662, "step": 3522 }, { "epoch": 0.9696552673226451, "grad_norm": 0.18103421902489195, "learning_rate": 9.228082983335329e-08, "loss": 0.7685, "step": 3523 }, { "epoch": 0.969930502993188, "grad_norm": 0.17549445706841693, "learning_rate": 9.061188540484989e-08, "loss": 0.7515, "step": 3524 }, { "epoch": 0.9702057386637308, "grad_norm": 0.18297146952209412, "learning_rate": 8.895813659074437e-08, "loss": 0.8175, "step": 3525 }, { "epoch": 0.9704809743342737, "grad_norm": 0.17487822209097756, "learning_rate": 8.731958465324486e-08, "loss": 0.755, "step": 3526 }, { "epoch": 0.9707562100048166, "grad_norm": 0.1782621251590202, "learning_rate": 8.569623084295541e-08, "loss": 0.7545, "step": 3527 }, { "epoch": 0.9710314456753595, "grad_norm": 0.17901393418751282, "learning_rate": 8.408807639888494e-08, "loss": 0.7656, "step": 3528 }, { "epoch": 0.9713066813459025, "grad_norm": 0.18147513421709688, "learning_rate": 8.249512254843827e-08, "loss": 0.7861, "step": 3529 }, { "epoch": 0.9715819170164454, "grad_norm": 0.17957197077444595, "learning_rate": 8.091737050741621e-08, "loss": 0.7782, "step": 3530 }, { "epoch": 0.9718571526869882, "grad_norm": 0.17672929151204175, "learning_rate": 7.93548214800266e-08, "loss": 0.7597, "step": 3531 }, { "epoch": 0.9721323883575311, "grad_norm": 0.4349985677603449, "learning_rate": 7.78074766588599e-08, "loss": 0.7466, "step": 3532 }, { "epoch": 0.972407624028074, "grad_norm": 0.17633210395280272, "learning_rate": 7.627533722491364e-08, "loss": 0.7701, "step": 3533 }, { "epoch": 0.9726828596986169, "grad_norm": 0.17672996696366425, "learning_rate": 7.475840434757686e-08, "loss": 0.7604, "step": 3534 }, { "epoch": 0.9729580953691599, "grad_norm": 0.17797749847629954, "learning_rate": 7.325667918462787e-08, "loss": 0.7608, "step": 3535 }, { "epoch": 0.9732333310397028, "grad_norm": 0.18212680472703546, "learning_rate": 7.177016288224315e-08, "loss": 0.7865, "step": 3536 }, { "epoch": 0.9735085667102457, "grad_norm": 0.1797596592006918, "learning_rate": 7.02988565749907e-08, "loss": 0.7647, "step": 3537 }, { "epoch": 0.9737838023807885, "grad_norm": 0.33927059362669343, "learning_rate": 6.884276138582557e-08, "loss": 0.8035, "step": 3538 }, { "epoch": 0.9740590380513314, "grad_norm": 0.17928149815434763, "learning_rate": 6.74018784260988e-08, "loss": 0.77, "step": 3539 }, { "epoch": 0.9743342737218743, "grad_norm": 0.18172923679065384, "learning_rate": 6.597620879554623e-08, "loss": 0.787, "step": 3540 }, { "epoch": 0.9746095093924173, "grad_norm": 0.17697969705309036, "learning_rate": 6.4565753582293e-08, "loss": 0.7558, "step": 3541 }, { "epoch": 0.9748847450629602, "grad_norm": 0.17581623954789408, "learning_rate": 6.317051386285356e-08, "loss": 0.7744, "step": 3542 }, { "epoch": 0.9751599807335031, "grad_norm": 0.18288959489294285, "learning_rate": 6.179049070213161e-08, "loss": 0.7987, "step": 3543 }, { "epoch": 0.975435216404046, "grad_norm": 0.17634802116204348, "learning_rate": 6.04256851534113e-08, "loss": 0.766, "step": 3544 }, { "epoch": 0.9757104520745888, "grad_norm": 0.17673599042413227, "learning_rate": 5.90760982583638e-08, "loss": 0.7655, "step": 3545 }, { "epoch": 0.9759856877451317, "grad_norm": 0.1820519779333532, "learning_rate": 5.774173104705183e-08, "loss": 0.7916, "step": 3546 }, { "epoch": 0.9762609234156747, "grad_norm": 0.175861718108478, "learning_rate": 5.642258453790961e-08, "loss": 0.7453, "step": 3547 }, { "epoch": 0.9765361590862176, "grad_norm": 0.17486506031221477, "learning_rate": 5.511865973776287e-08, "loss": 0.7406, "step": 3548 }, { "epoch": 0.9768113947567605, "grad_norm": 0.1780123599448829, "learning_rate": 5.382995764181775e-08, "loss": 0.7565, "step": 3549 }, { "epoch": 0.9770866304273034, "grad_norm": 0.17599109803309804, "learning_rate": 5.2556479233663026e-08, "loss": 0.7611, "step": 3550 }, { "epoch": 0.9773618660978463, "grad_norm": 0.17667839153089893, "learning_rate": 5.129822548526342e-08, "loss": 0.7545, "step": 3551 }, { "epoch": 0.9776371017683891, "grad_norm": 0.17280983449231263, "learning_rate": 5.005519735696851e-08, "loss": 0.7479, "step": 3552 }, { "epoch": 0.9779123374389321, "grad_norm": 0.176523232961457, "learning_rate": 4.882739579750606e-08, "loss": 0.7672, "step": 3553 }, { "epoch": 0.978187573109475, "grad_norm": 0.18141590619449227, "learning_rate": 4.761482174398202e-08, "loss": 0.7813, "step": 3554 }, { "epoch": 0.9784628087800179, "grad_norm": 0.17584721367548525, "learning_rate": 4.641747612187608e-08, "loss": 0.7656, "step": 3555 }, { "epoch": 0.9787380444505608, "grad_norm": 0.17578752767897704, "learning_rate": 4.523535984505278e-08, "loss": 0.7694, "step": 3556 }, { "epoch": 0.9790132801211037, "grad_norm": 0.17692227791635268, "learning_rate": 4.406847381574819e-08, "loss": 0.7762, "step": 3557 }, { "epoch": 0.9792885157916466, "grad_norm": 0.17881547873142664, "learning_rate": 4.291681892457211e-08, "loss": 0.7646, "step": 3558 }, { "epoch": 0.9795637514621895, "grad_norm": 0.17752552599136098, "learning_rate": 4.178039605051698e-08, "loss": 0.7883, "step": 3559 }, { "epoch": 0.9798389871327324, "grad_norm": 0.17557339109858847, "learning_rate": 4.065920606093787e-08, "loss": 0.7636, "step": 3560 }, { "epoch": 0.9801142228032753, "grad_norm": 0.17574774842950883, "learning_rate": 3.9553249811576936e-08, "loss": 0.7775, "step": 3561 }, { "epoch": 0.9803894584738182, "grad_norm": 0.1772844194397064, "learning_rate": 3.846252814654117e-08, "loss": 0.7571, "step": 3562 }, { "epoch": 0.9806646941443611, "grad_norm": 0.17729271362679233, "learning_rate": 3.738704189830689e-08, "loss": 0.7679, "step": 3563 }, { "epoch": 0.980939929814904, "grad_norm": 0.17909654849099269, "learning_rate": 3.632679188773303e-08, "loss": 0.7684, "step": 3564 }, { "epoch": 0.981215165485447, "grad_norm": 0.1833268540809398, "learning_rate": 3.528177892403894e-08, "loss": 0.8016, "step": 3565 }, { "epoch": 0.9814904011559898, "grad_norm": 0.178510730996882, "learning_rate": 3.425200380481997e-08, "loss": 0.7641, "step": 3566 }, { "epoch": 0.9817656368265327, "grad_norm": 0.1820364095197272, "learning_rate": 3.3237467316042937e-08, "loss": 0.7578, "step": 3567 }, { "epoch": 0.9820408724970756, "grad_norm": 0.18138425675425987, "learning_rate": 3.2238170232037346e-08, "loss": 0.7832, "step": 3568 }, { "epoch": 0.9823161081676185, "grad_norm": 0.17304657211950922, "learning_rate": 3.125411331550643e-08, "loss": 0.7526, "step": 3569 }, { "epoch": 0.9825913438381614, "grad_norm": 0.17567294905646494, "learning_rate": 3.028529731752272e-08, "loss": 0.754, "step": 3570 }, { "epoch": 0.9828665795087044, "grad_norm": 0.17489419630595576, "learning_rate": 2.9331722977523625e-08, "loss": 0.7554, "step": 3571 }, { "epoch": 0.9831418151792473, "grad_norm": 0.17745684383307575, "learning_rate": 2.83933910233114e-08, "loss": 0.759, "step": 3572 }, { "epoch": 0.9834170508497901, "grad_norm": 0.17206565807239377, "learning_rate": 2.7470302171057616e-08, "loss": 0.7384, "step": 3573 }, { "epoch": 0.983692286520333, "grad_norm": 0.17641064197005765, "learning_rate": 2.6562457125300922e-08, "loss": 0.7616, "step": 3574 }, { "epoch": 0.9839675221908759, "grad_norm": 0.177807913042134, "learning_rate": 2.566985657894483e-08, "loss": 0.7653, "step": 3575 }, { "epoch": 0.9842427578614188, "grad_norm": 0.1790946374687885, "learning_rate": 2.4792501213253272e-08, "loss": 0.7548, "step": 3576 }, { "epoch": 0.9845179935319618, "grad_norm": 0.1780544315960312, "learning_rate": 2.393039169785949e-08, "loss": 0.7847, "step": 3577 }, { "epoch": 0.9847932292025047, "grad_norm": 0.17834656751545258, "learning_rate": 2.308352869075936e-08, "loss": 0.7577, "step": 3578 }, { "epoch": 0.9850684648730476, "grad_norm": 0.1790462243194874, "learning_rate": 2.2251912838311408e-08, "loss": 0.7698, "step": 3579 }, { "epoch": 0.9853437005435904, "grad_norm": 0.18434661086620202, "learning_rate": 2.1435544775234574e-08, "loss": 0.772, "step": 3580 }, { "epoch": 0.9856189362141333, "grad_norm": 0.1764678047623265, "learning_rate": 2.0634425124614886e-08, "loss": 0.7903, "step": 3581 }, { "epoch": 0.9858941718846762, "grad_norm": 0.1906681654294173, "learning_rate": 1.98485544978988e-08, "loss": 0.7519, "step": 3582 }, { "epoch": 0.9861694075552192, "grad_norm": 0.1744191667294603, "learning_rate": 1.9077933494888733e-08, "loss": 0.7709, "step": 3583 }, { "epoch": 0.9864446432257621, "grad_norm": 0.1760264371211054, "learning_rate": 1.8322562703758652e-08, "loss": 0.7524, "step": 3584 }, { "epoch": 0.986719878896305, "grad_norm": 0.1804041052742331, "learning_rate": 1.758244270103182e-08, "loss": 0.7555, "step": 3585 }, { "epoch": 0.9869951145668479, "grad_norm": 0.17425152073973565, "learning_rate": 1.68575740515986e-08, "loss": 0.7536, "step": 3586 }, { "epoch": 0.9872703502373907, "grad_norm": 0.18022990927311366, "learning_rate": 1.614795730870311e-08, "loss": 0.7884, "step": 3587 }, { "epoch": 0.9875455859079336, "grad_norm": 0.1692149686164318, "learning_rate": 1.545359301395877e-08, "loss": 0.7498, "step": 3588 }, { "epoch": 0.9878208215784766, "grad_norm": 0.1787623881772803, "learning_rate": 1.4774481697326093e-08, "loss": 0.7754, "step": 3589 }, { "epoch": 0.9880960572490195, "grad_norm": 0.17906661051327927, "learning_rate": 1.411062387713269e-08, "loss": 0.7665, "step": 3590 }, { "epoch": 0.9883712929195624, "grad_norm": 0.18088041897608118, "learning_rate": 1.3462020060057701e-08, "loss": 0.7732, "step": 3591 }, { "epoch": 0.9886465285901053, "grad_norm": 0.5259410023169809, "learning_rate": 1.2828670741140693e-08, "loss": 0.7695, "step": 3592 }, { "epoch": 0.9889217642606482, "grad_norm": 0.17698042182388626, "learning_rate": 1.2210576403779428e-08, "loss": 0.7936, "step": 3593 }, { "epoch": 0.989196999931191, "grad_norm": 0.1736793906227136, "learning_rate": 1.1607737519727658e-08, "loss": 0.7464, "step": 3594 }, { "epoch": 0.989472235601734, "grad_norm": 0.17547954770090057, "learning_rate": 1.1020154549095108e-08, "loss": 0.7372, "step": 3595 }, { "epoch": 0.9897474712722769, "grad_norm": 0.1793344090138556, "learning_rate": 1.0447827940345268e-08, "loss": 0.7577, "step": 3596 }, { "epoch": 0.9900227069428198, "grad_norm": 0.17520452136961606, "learning_rate": 9.890758130304268e-09, "loss": 0.7566, "step": 3597 }, { "epoch": 0.9902979426133627, "grad_norm": 0.18144790080034406, "learning_rate": 9.348945544147558e-09, "loss": 0.7877, "step": 3598 }, { "epoch": 0.9905731782839056, "grad_norm": 0.181632924815335, "learning_rate": 8.822390595404352e-09, "loss": 0.7832, "step": 3599 }, { "epoch": 0.9908484139544484, "grad_norm": 0.17877247539950303, "learning_rate": 8.311093685966498e-09, "loss": 0.7779, "step": 3600 }, { "epoch": 0.9911236496249914, "grad_norm": 0.17896780550493493, "learning_rate": 7.815055206072952e-09, "loss": 0.7878, "step": 3601 }, { "epoch": 0.9913988852955343, "grad_norm": 0.17442436243710627, "learning_rate": 7.3342755343208674e-09, "loss": 0.7653, "step": 3602 }, { "epoch": 0.9916741209660772, "grad_norm": 0.17777321615401723, "learning_rate": 6.868755037658937e-09, "loss": 0.7767, "step": 3603 }, { "epoch": 0.9919493566366201, "grad_norm": 0.17503196499366, "learning_rate": 6.418494071389614e-09, "loss": 0.7746, "step": 3604 }, { "epoch": 0.992224592307163, "grad_norm": 0.17898836174974275, "learning_rate": 5.983492979171335e-09, "loss": 0.7788, "step": 3605 }, { "epoch": 0.9924998279777059, "grad_norm": 0.18116771238038662, "learning_rate": 5.563752093011854e-09, "loss": 0.7624, "step": 3606 }, { "epoch": 0.9927750636482489, "grad_norm": 0.17698711904140782, "learning_rate": 5.159271733274907e-09, "loss": 0.786, "step": 3607 }, { "epoch": 0.9930502993187917, "grad_norm": 0.17765865208863188, "learning_rate": 4.770052208673548e-09, "loss": 0.7738, "step": 3608 }, { "epoch": 0.9933255349893346, "grad_norm": 0.17657751738507468, "learning_rate": 4.396093816279035e-09, "loss": 0.7707, "step": 3609 }, { "epoch": 0.9936007706598775, "grad_norm": 0.17454547847072047, "learning_rate": 4.037396841507501e-09, "loss": 0.7575, "step": 3610 }, { "epoch": 0.9938760063304204, "grad_norm": 0.17714898000454996, "learning_rate": 3.693961558131065e-09, "loss": 0.7731, "step": 3611 }, { "epoch": 0.9941512420009633, "grad_norm": 0.17622566153292368, "learning_rate": 3.3657882282733812e-09, "loss": 0.7573, "step": 3612 }, { "epoch": 0.9944264776715063, "grad_norm": 0.17709442785780347, "learning_rate": 3.052877102409646e-09, "loss": 0.7718, "step": 3613 }, { "epoch": 0.9947017133420492, "grad_norm": 0.1791136045353406, "learning_rate": 2.755228419364375e-09, "loss": 0.8098, "step": 3614 }, { "epoch": 0.994976949012592, "grad_norm": 0.18038236847042155, "learning_rate": 2.472842406315845e-09, "loss": 0.7667, "step": 3615 }, { "epoch": 0.9952521846831349, "grad_norm": 0.17588115954026287, "learning_rate": 2.205719278789431e-09, "loss": 0.747, "step": 3616 }, { "epoch": 0.9955274203536778, "grad_norm": 0.17653906931286567, "learning_rate": 1.9538592406664892e-09, "loss": 0.7664, "step": 3617 }, { "epoch": 0.9958026560242207, "grad_norm": 0.5789252284335775, "learning_rate": 1.7172624841754748e-09, "loss": 0.7929, "step": 3618 }, { "epoch": 0.9960778916947637, "grad_norm": 0.1763033312587939, "learning_rate": 1.4959291898963836e-09, "loss": 0.745, "step": 3619 }, { "epoch": 0.9963531273653066, "grad_norm": 0.17840002630427262, "learning_rate": 1.2898595267585301e-09, "loss": 0.7756, "step": 3620 }, { "epoch": 0.9966283630358495, "grad_norm": 0.1767728346272913, "learning_rate": 1.0990536520427696e-09, "loss": 0.7875, "step": 3621 }, { "epoch": 0.9969035987063923, "grad_norm": 0.1698190573293454, "learning_rate": 9.235117113792768e-10, "loss": 0.7447, "step": 3622 }, { "epoch": 0.9971788343769352, "grad_norm": 0.17788327054454803, "learning_rate": 7.632338387497662e-10, "loss": 0.7855, "step": 3623 }, { "epoch": 0.9974540700474781, "grad_norm": 0.17666564044937066, "learning_rate": 6.182201564830514e-10, "loss": 0.7556, "step": 3624 }, { "epoch": 0.9977293057180211, "grad_norm": 0.1769705344619, "learning_rate": 4.884707752594864e-10, "loss": 0.7673, "step": 3625 }, { "epoch": 0.998004541388564, "grad_norm": 0.20211506220176842, "learning_rate": 3.739857941087444e-10, "loss": 0.7606, "step": 3626 }, { "epoch": 0.9982797770591069, "grad_norm": 0.17917807655749818, "learning_rate": 2.747653004098183e-10, "loss": 0.7782, "step": 3627 }, { "epoch": 0.9985550127296497, "grad_norm": 0.18205947891349653, "learning_rate": 1.9080936989324117e-10, "loss": 0.7419, "step": 3628 }, { "epoch": 0.9988302484001926, "grad_norm": 0.17364769553726467, "learning_rate": 1.221180666344246e-10, "loss": 0.7561, "step": 3629 }, { "epoch": 0.9991054840707355, "grad_norm": 0.16925193408970945, "learning_rate": 6.869144306476117e-11, "loss": 0.7399, "step": 3630 }, { "epoch": 0.9993807197412785, "grad_norm": 0.21789214395467008, "learning_rate": 3.0529539960522104e-11, "loss": 0.7566, "step": 3631 }, { "epoch": 0.9996559554118214, "grad_norm": 0.1819224016534888, "learning_rate": 7.632386447298245e-12, "loss": 0.7667, "step": 3632 }, { "epoch": 0.9999311910823643, "grad_norm": 0.17616693190335297, "learning_rate": 0.0, "loss": 0.7571, "step": 3633 }, { "epoch": 0.9999311910823643, "step": 3633, "total_flos": 3475585917517824.0, "train_loss": 0.8154109569577649, "train_runtime": 36551.3329, "train_samples_per_second": 57.255, "train_steps_per_second": 0.099 } ], "logging_steps": 1, "max_steps": 3633, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3475585917517824.0, "train_batch_size": 9, "trial_name": null, "trial_params": null }